Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
61 files changed, 6277 insertions, 4675 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
index 1e683db50206..ce09a477b5f5 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -174,8 +174,8 @@ class AggressiveDeadCodeElimination {
   /// marked live.
   void markLiveBranchesFromControlDependences();

-  /// Remove instructions not marked live, return if any any instruction
-  /// was removed.
+  /// Remove instructions not marked live, return if any instruction was
+  /// removed.
   bool removeDeadInstructions();

   /// Identify connected sections of the control flow graph which have
@@ -298,8 +298,8 @@ void AggressiveDeadCodeElimination::initialize() {
     auto &Info = BlockInfo[BB];
     // Real function return
     if (isa<ReturnInst>(Info.Terminator)) {
-      DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
-                   << '\n';);
+      LLVM_DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
+                        << '\n';);
       continue;
     }

@@ -356,7 +356,7 @@ void AggressiveDeadCodeElimination::markLiveInstructions() {
     // where we need to mark the inputs as live.
     while (!Worklist.empty()) {
       Instruction *LiveInst = Worklist.pop_back_val();
-      DEBUG(dbgs() << "work live: "; LiveInst->dump(););
+      LLVM_DEBUG(dbgs() << "work live: "; LiveInst->dump(););

       for (Use &OI : LiveInst->operands())
         if (Instruction *Inst = dyn_cast<Instruction>(OI))
@@ -378,7 +378,7 @@ void AggressiveDeadCodeElimination::markLive(Instruction *I) {
   if (Info.Live)
     return;

-  DEBUG(dbgs() << "mark live: "; I->dump());
+  LLVM_DEBUG(dbgs() << "mark live: "; I->dump());
   Info.Live = true;
   Worklist.push_back(I);

@@ -402,7 +402,7 @@ void AggressiveDeadCodeElimination::markLive(BlockInfoType &BBInfo) {
   if (BBInfo.Live)
     return;
-  DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
+  LLVM_DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
   BBInfo.Live = true;
   if (!BBInfo.CFLive) {
     BBInfo.CFLive = true;
@@ -463,7 +463,7 @@ void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
   if (BlocksWithDeadTerminators.empty())
     return;

-  DEBUG({
+  LLVM_DEBUG({
     dbgs() << "new live blocks:\n";
     for (auto *BB : NewLiveBlocks)
       dbgs() << "\t" << BB->getName() << '\n';
@@ -487,7 +487,7 @@ void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {

   // Dead terminators which control live blocks are now marked live.
   for (auto *BB : IDFBlocks) {
-    DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
+    LLVM_DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
     markLive(BB->getTerminator());
   }
 }
@@ -501,7 +501,7 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() {
   // Updates control and dataflow around dead blocks
   updateDeadRegions();

-  DEBUG({
+  LLVM_DEBUG({
     for (Instruction &I : instructions(F)) {
       // Check if the instruction is alive.
       if (isLive(&I))
@@ -555,7 +555,7 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() {

 // A dead region is the set of dead blocks with a common live post-dominator.
 void AggressiveDeadCodeElimination::updateDeadRegions() {
-  DEBUG({
+  LLVM_DEBUG({
     dbgs() << "final dead terminator blocks: " << '\n';
     for (auto *BB : BlocksWithDeadTerminators)
       dbgs() << '\t' << BB->getName()
@@ -607,8 +607,9 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
       // It might have happened that the same successor appeared multiple times
      // and the CFG edge wasn't really removed.
      if (Succ != PreferredSucc->BB) {
-        DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion"
-                     << BB->getName() << " -> " << Succ->getName() << "\n");
+        LLVM_DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion"
+                          << BB->getName() << " -> " << Succ->getName()
+                          << "\n");
         DeletedEdges.push_back({DominatorTree::Delete, BB, Succ});
       }
     }
@@ -652,7 +653,7 @@ void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
     InstInfo[PredTerm].Live = true;
     return;
   }
-  DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
+  LLVM_DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
   NumBranchesRemoved += 1;
   IRBuilder<> Builder(PredTerm);
   auto *NewTerm = Builder.CreateBr(Target);
diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index 99480f12da9e..fa7bcec677f7 100644
--- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -98,8 +98,8 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
   const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
   const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);

-  DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " <<
-        *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
+  LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "
+                    << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");

   if (const SCEVConstant *ConstDUSCEV =
       dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
@@ -139,12 +139,12 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
   // address. This address is displaced by the provided offset.
   DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);

-  DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to " <<
-        *AlignSCEV << " and offset " << *OffSCEV <<
-        " using diff " << *DiffSCEV << "\n");
+  LLVM_DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to "
+                    << *AlignSCEV << " and offset " << *OffSCEV
+                    << " using diff " << *DiffSCEV << "\n");

   unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE);
-  DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
+  LLVM_DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");

   if (NewAlignment) {
     return NewAlignment;
@@ -160,8 +160,8 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
     const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
     const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);

-    DEBUG(dbgs() << "\ttrying start/inc alignment using start " <<
-          *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
+    LLVM_DEBUG(dbgs() << "\ttrying start/inc alignment using start "
+                      << *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");

     // Now compute the new alignment using the displacement to the value in the
     // first iteration, and also the alignment using the per-iteration delta.
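A quick illustration of the start/increment rule these getNewAlignment() hunks implement, as a minimal standalone C++ sketch — the helper name and the driver are ours for exposition, not part of the patch:

#include <cstdio>

// Alignment guaranteed on every loop iteration, given the alignment proven
// for iteration 0 (Start) and for the per-iteration increment (Inc).
// A result of 0 means nothing could be proven.
static unsigned commonStartIncAlignment(unsigned Start, unsigned Inc) {
  if (!Start || !Inc)
    return 0;                              // one side unknown: give up
  if (Start > Inc)
    return (Start % Inc == 0) ? Inc : 0;   // the step keeps the smaller alignment
  if (Inc > Start)
    return (Inc % Start == 0) ? Start : 0;
  return Start;                            // equal alignments trivially agree
}

int main() {
  // A pointer 32-byte aligned on entry, advanced 16 bytes per iteration,
  // stays 16-byte aligned throughout the loop.
  printf("%u\n", commonStartIncAlignment(32, 16)); // prints 16
}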
@@ -170,26 +170,26 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
     NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
     unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);

-    DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
-    DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
+    LLVM_DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
+    LLVM_DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");

     if (!NewAlignment || !NewIncAlignment) {
       return 0;
     } else if (NewAlignment > NewIncAlignment) {
       if (NewAlignment % NewIncAlignment == 0) {
-        DEBUG(dbgs() << "\tnew start/inc alignment: " <<
-              NewIncAlignment << "\n");
+        LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewIncAlignment
+                          << "\n");
         return NewIncAlignment;
       }
     } else if (NewIncAlignment > NewAlignment) {
       if (NewIncAlignment % NewAlignment == 0) {
-        DEBUG(dbgs() << "\tnew start/inc alignment: " <<
-              NewAlignment << "\n");
+        LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewAlignment
+                          << "\n");
         return NewAlignment;
       }
     } else if (NewIncAlignment == NewAlignment) {
-      DEBUG(dbgs() << "\tnew start/inc alignment: " <<
-            NewAlignment << "\n");
+      LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewAlignment
+                        << "\n");
       return NewAlignment;
     }
   }
 }
@@ -339,55 +339,24 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
     unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
                                                 MI->getDest(), SE);

-    // For memory transfers, we need a common alignment for both the
-    // source and destination. If we have a new alignment for this
-    // instruction, but only for one operand, save it. If we reach the
-    // other operand through another assumption later, then we may
-    // change the alignment at that point.
+    LLVM_DEBUG(dbgs() << "\tmem inst: " << NewDestAlignment << "\n";);
+    if (NewDestAlignment > MI->getDestAlignment()) {
+      MI->setDestAlignment(NewDestAlignment);
+      ++NumMemIntAlignChanged;
+    }
+
+    // For memory transfers, there is also a source alignment that
+    // can be set.
     if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
       unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
                                                  MTI->getSource(), SE);

-      DenseMap<MemTransferInst *, unsigned>::iterator DI =
-        NewDestAlignments.find(MTI);
-      unsigned AltDestAlignment = (DI == NewDestAlignments.end()) ?
-                                  0 : DI->second;
-
-      DenseMap<MemTransferInst *, unsigned>::iterator SI =
-        NewSrcAlignments.find(MTI);
-      unsigned AltSrcAlignment = (SI == NewSrcAlignments.end()) ?
-                                 0 : SI->second;
-
-      DEBUG(dbgs() << "\tmem trans: " << NewDestAlignment << " " <<
-            AltDestAlignment << " " << NewSrcAlignment <<
-            " " << AltSrcAlignment << "\n");
-
-      // Of these four alignments, pick the largest possible...
-      unsigned NewAlignment = 0;
-      if (NewDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
-        NewAlignment = std::max(NewAlignment, NewDestAlignment);
-      if (AltDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
-        NewAlignment = std::max(NewAlignment, AltDestAlignment);
-      if (NewSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
-        NewAlignment = std::max(NewAlignment, NewSrcAlignment);
-      if (AltSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
-        NewAlignment = std::max(NewAlignment, AltSrcAlignment);
-
-      if (NewAlignment > MI->getAlignment()) {
-        MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
-          MI->getParent()->getContext()), NewAlignment));
+      LLVM_DEBUG(dbgs() << "\tmem trans: " << NewSrcAlignment << "\n";);
+
+      if (NewSrcAlignment > MTI->getSourceAlignment()) {
+        MTI->setSourceAlignment(NewSrcAlignment);
         ++NumMemIntAlignChanged;
       }
-
-      NewDestAlignments.insert(std::make_pair(MTI, NewDestAlignment));
-      NewSrcAlignments.insert(std::make_pair(MTI, NewSrcAlignment));
-    } else if (NewDestAlignment > MI->getAlignment()) {
-      assert((!isa<MemIntrinsic>(MI) || isa<MemSetInst>(MI)) &&
-             "Unknown memory intrinsic");
-
-      MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
-        MI->getParent()->getContext()), NewDestAlignment));
-      ++NumMemIntAlignChanged;
     }
   }

@@ -421,9 +390,6 @@ bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC,
   SE = SE_;
   DT = DT_;

-  NewDestAlignments.clear();
-  NewSrcAlignments.clear();
-
   bool Changed = false;
   for (auto &AssumeVH : AC.assumptions())
     if (AssumeVH)
diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
index 851efa000f65..3a8ef073cb48 100644
--- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Pass.h"
@@ -99,7 +100,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
       // For live instructions that have all dead bits, first make them dead by
       // replacing all uses with something else. Then, if they don't need to
       // remain live (because they have side effects, etc.) we can remove them.
-      DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+      LLVM_DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");

       clearAssumptionsOfUsers(&I, DB);

@@ -114,6 +115,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
     if (!DB.isInstructionDead(&I))
       continue;

+    salvageDebugInfo(I);
     Worklist.push_back(&I);
     I.dropAllReferences();
     Changed = true;
diff --git a/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
index 7488cd5af8be..5ebfbf8a879b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -59,12 +59,14 @@
 #include "llvm/Transforms/Scalar/CallSiteSplitting.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/Cloning.h"

 using namespace llvm;
 using namespace PatternMatch;
@@ -73,9 +75,16 @@ using namespace PatternMatch;

 STATISTIC(NumCallSiteSplit, "Number of call-site split");

-static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI,
-                                Value *Op) {
-  CallSite CS(NewCallI);
+/// Only allow instructions before a call, if their CodeSize cost is below
+/// DuplicationThreshold. Those instructions need to be duplicated in all
+/// split blocks.
+static cl::opt<unsigned>
+    DuplicationThreshold("callsite-splitting-duplication-threshold", cl::Hidden,
+                         cl::desc("Only allow instructions before a call, if "
+                                  "their cost is below DuplicationThreshold"),
+                         cl::init(5));
+
+static void addNonNullAttribute(CallSite CS, Value *Op) {
   unsigned ArgNo = 0;
   for (auto &I : CS.args()) {
     if (&*I == Op)
@@ -84,13 +93,16 @@ static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI,
   }
 }

-static void setConstantInArgument(Instruction *CallI, Instruction *NewCallI,
-                                  Value *Op, Constant *ConstValue) {
-  CallSite CS(NewCallI);
+static void setConstantInArgument(CallSite CS, Value *Op,
+                                  Constant *ConstValue) {
   unsigned ArgNo = 0;
   for (auto &I : CS.args()) {
-    if (&*I == Op)
+    if (&*I == Op) {
+      // It is possible we have already added the non-null attribute to the
+      // parameter by using an earlier constraining condition.
+      CS.removeParamAttr(ArgNo, Attribute::NonNull);
       CS.setArgument(ArgNo, ConstValue);
+    }
     ++ArgNo;
   }
 }
@@ -111,11 +123,13 @@ static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
   return false;
 }

+typedef std::pair<ICmpInst *, unsigned> ConditionTy;
+typedef SmallVector<ConditionTy, 2> ConditionsTy;
+
 /// If From has a conditional jump to To, add the condition to Conditions,
 /// if it is relevant to any argument at CS.
-static void
-recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To,
-                SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+static void recordCondition(CallSite CS, BasicBlock *From, BasicBlock *To,
+                            ConditionsTy &Conditions) {
   auto *BI = dyn_cast<BranchInst>(From->getTerminator());
   if (!BI || !BI->isConditional())
     return;
@@ -134,11 +148,10 @@ recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To,
 }

 /// Record ICmp conditions relevant to any argument in CS following Pred's
-/// single successors. If there are conflicting conditions along a path, like
+/// single predecessors. If there are conflicting conditions along a path, like
 /// x == 1 and x == 0, the first condition will be used.
-static void
-recordConditions(const CallSite &CS, BasicBlock *Pred,
-                 SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
+static void recordConditions(CallSite CS, BasicBlock *Pred,
+                             ConditionsTy &Conditions) {
   recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions);
   BasicBlock *From = Pred;
   BasicBlock *To = Pred;
@@ -151,24 +164,17 @@ recordConditions(const CallSite &CS, BasicBlock *Pred,
   }
 }

-static Instruction *
-addConditions(CallSite &CS,
-              SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
-  if (Conditions.empty())
-    return nullptr;
-
-  Instruction *NewCI = CS.getInstruction()->clone();
+static void addConditions(CallSite CS, const ConditionsTy &Conditions) {
   for (auto &Cond : Conditions) {
     Value *Arg = Cond.first->getOperand(0);
     Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1));
     if (Cond.second == ICmpInst::ICMP_EQ)
-      setConstantInArgument(CS.getInstruction(), NewCI, Arg, ConstVal);
+      setConstantInArgument(CS, Arg, ConstVal);
     else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
       assert(Cond.second == ICmpInst::ICMP_NE);
-      addNonNullAttribute(CS.getInstruction(), NewCI, Arg);
+      addNonNullAttribute(CS, Arg);
     }
   }
-  return NewCI;
 }

 static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
@@ -177,28 +183,39 @@ static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
   return Preds;
 }

-static bool canSplitCallSite(CallSite CS) {
+static bool canSplitCallSite(CallSite CS, TargetTransformInfo &TTI) {
   // FIXME: As of now we handle only CallInst. InvokeInst could be handled
   // without too much effort.
   Instruction *Instr = CS.getInstruction();
   if (!isa<CallInst>(Instr))
     return false;

-  // Allow splitting a call-site only when there is no instruction before the
-  // call-site in the basic block. Based on this constraint, we only clone the
-  // call instruction, and we do not move a call-site across any other
-  // instruction.
   BasicBlock *CallSiteBB = Instr->getParent();
-  if (Instr != CallSiteBB->getFirstNonPHIOrDbg())
-    return false;
-
   // Need 2 predecessors and cannot split an edge from an IndirectBrInst.
   SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
   if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
       isa<IndirectBrInst>(Preds[1]->getTerminator()))
     return false;

-  return CallSiteBB->canSplitPredecessors();
+  // BasicBlock::canSplitPredecessors is more aggressive, so checking for
+  // BasicBlock::isEHPad as well.
+  if (!CallSiteBB->canSplitPredecessors() || CallSiteBB->isEHPad())
+    return false;
+
+  // Allow splitting a call-site only when the CodeSize cost of the
+  // instructions before the call is less than DuplicationThreshold. The
+  // instructions before the call will be duplicated in the split blocks and
+  // corresponding uses will be updated.
+  unsigned Cost = 0;
+  for (auto &InstBeforeCall :
+       llvm::make_range(CallSiteBB->begin(), Instr->getIterator())) {
+    Cost += TTI.getInstructionCost(&InstBeforeCall,
+                                   TargetTransformInfo::TCK_CodeSize);
+    if (Cost >= DuplicationThreshold)
+      return false;
+  }
+
+  return true;
 }

 static Instruction *cloneInstForMustTail(Instruction *I, Instruction *Before,
@@ -224,11 +241,11 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
   bool IsVoid = SplitBB->getParent()->getReturnType()->isVoidTy();
   auto II = std::next(CI->getIterator());

-  BitCastInst *BCI = dyn_cast<BitCastInst>(&*II);
+  BitCastInst* BCI = dyn_cast<BitCastInst>(&*II);
   if (BCI)
     ++II;

-  ReturnInst *RI = dyn_cast<ReturnInst>(&*II);
+  ReturnInst* RI = dyn_cast<ReturnInst>(&*II);
   assert(RI && "`musttail` call must be followed by `ret` instruction");

   TerminatorInst *TI = SplitBB->getTerminator();
@@ -241,14 +258,15 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
   // that prevents doing this now.
 }

-/// Return true if the CS is split into its new predecessors which are directly
-/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2.
-/// CallInst1 and CallInst2 will be the new call-sites placed in the new
-/// predecessors split for PredBB1 and PredBB2, respectively.
+/// For each (predecessor, conditions from predecessors) pair, it will split the
+/// basic block containing the call site, hook it up to the predecessor and
+/// replace the call instruction with new call instructions, which contain
+/// constraints based on the conditions from their predecessors.
 /// For example, in the IR below with an OR condition, the call-site can
-/// be split. Assuming PredBB1=Header and PredBB2=TBB, CallInst1 will be the
-/// call-site placed between Header and Tail, and CallInst2 will be the
-/// call-site between TBB and Tail.
+/// be split. In this case, Preds for Tail is [(Header, a == null),
+/// (TBB, a != null, b == null)]. Tail is replaced by 2 split blocks, containing
+/// CallInst1, which has constraints based on the conditions from Head and
+/// CallInst2, which has constraints based on the conditions coming from TBB.
 ///
 /// From :
 ///
@@ -281,61 +299,59 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
 /// Note that in case any arguments at the call-site are constrained by its
 /// predecessors, new call-sites with more constrained arguments will be
 /// created in createCallSitesOnPredicatedArgument().
-static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
-                          Instruction *CallInst1, Instruction *CallInst2) {
+static void splitCallSite(
+    CallSite CS,
+    const SmallVectorImpl<std::pair<BasicBlock *, ConditionsTy>> &Preds,
+    DominatorTree *DT) {
   Instruction *Instr = CS.getInstruction();
   BasicBlock *TailBB = Instr->getParent();
   bool IsMustTailCall = CS.isMustTailCall();

-  assert(Instr == (TailBB->getFirstNonPHIOrDbg()) && "Unexpected call-site");
-
-  BasicBlock *SplitBlock1 =
-      SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
-  BasicBlock *SplitBlock2 =
-      SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
-
-  assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
-
-  if (!CallInst1)
-    CallInst1 = Instr->clone();
-  if (!CallInst2)
-    CallInst2 = Instr->clone();
-
-  CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
-  CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
-
-  CallSite CS1(CallInst1);
-  CallSite CS2(CallInst2);
-
-  // Handle PHIs used as arguments in the call-site.
-  for (PHINode &PN : TailBB->phis()) {
-    unsigned ArgNo = 0;
-    for (auto &CI : CS.args()) {
-      if (&*CI == &PN) {
-        CS1.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock1));
-        CS2.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock2));
+  PHINode *CallPN = nullptr;
+
+  // `musttail` calls must be followed by optional `bitcast`, and `ret`. The
+  // split blocks will be terminated right after that so there are no users for
+  // this phi in a `TailBB`.
+  if (!IsMustTailCall && !Instr->use_empty())
+    CallPN = PHINode::Create(Instr->getType(), Preds.size(), "phi.call");
+
+  LLVM_DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+
+  assert(Preds.size() == 2 && "The ValueToValueMaps array has size 2.");
+  // ValueToValueMapTy is neither copy nor moveable, so we use a simple array
+  // here.
+  ValueToValueMapTy ValueToValueMaps[2];
+  for (unsigned i = 0; i < Preds.size(); i++) {
+    BasicBlock *PredBB = Preds[i].first;
+    BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
+        TailBB, PredBB, &*std::next(Instr->getIterator()), ValueToValueMaps[i],
+        DT);
+    assert(SplitBlock && "Unexpected new basic block split.");
+
+    Instruction *NewCI =
+        &*std::prev(SplitBlock->getTerminator()->getIterator());
+    CallSite NewCS(NewCI);
+    addConditions(NewCS, Preds[i].second);
+
+    // Handle PHIs used as arguments in the call-site.
+    for (PHINode &PN : TailBB->phis()) {
+      unsigned ArgNo = 0;
+      for (auto &CI : CS.args()) {
+        if (&*CI == &PN) {
+          NewCS.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock));
+        }
+        ++ArgNo;
       }
-      ++ArgNo;
     }
+    LLVM_DEBUG(dbgs() << "    " << *NewCI << " in " << SplitBlock->getName()
+                      << "\n");
+    if (CallPN)
+      CallPN->addIncoming(NewCI, SplitBlock);
+
+    // Clone and place bitcast and return instructions before `TI`
+    if (IsMustTailCall)
+      copyMustTailReturn(SplitBlock, Instr, NewCI);
   }

-  // Clone and place bitcast and return instructions before `TI`
-  if (IsMustTailCall) {
-    copyMustTailReturn(SplitBlock1, CS.getInstruction(), CallInst1);
-    copyMustTailReturn(SplitBlock2, CS.getInstruction(), CallInst2);
-  }
-
-  // Replace users of the original call with a PHI mering call-sites split.
-  if (!IsMustTailCall && Instr->getNumUses()) {
-    PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call",
-                                  TailBB->getFirstNonPHI());
-    PN->addIncoming(CallInst1, SplitBlock1);
-    PN->addIncoming(CallInst2, SplitBlock2);
-    Instr->replaceAllUsesWith(PN);
-  }
-  DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
-  DEBUG(dbgs() << "    " << *CallInst1 << " in " << SplitBlock1->getName()
-               << "\n");
-  DEBUG(dbgs() << "    " << *CallInst2 << " in " << SplitBlock2->getName()
-               << "\n");

   NumCallSiteSplit++;

@@ -354,7 +370,41 @@ static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
     TailBB->eraseFromParent();
     return;
   }
-  Instr->eraseFromParent();
+
+  auto *OriginalBegin = &*TailBB->begin();
+  // Replace users of the original call with a PHI merging call-sites split.
+  if (CallPN) {
+    CallPN->insertBefore(OriginalBegin);
+    Instr->replaceAllUsesWith(CallPN);
+  }
+
+  // Remove instructions moved to split blocks from TailBB, from the duplicated
+  // call instruction to the beginning of the basic block. If an instruction
+  // has any uses, add a new PHI node to combine the values coming from the
+  // split blocks. The new PHI nodes are placed before the first original
+  // instruction, so we do not end up deleting them. By using reverse-order, we
+  // do not introduce unnecessary PHI nodes for def-use chains from the call
+  // instruction to the beginning of the block.
+  auto I = Instr->getReverseIterator();
+  while (I != TailBB->rend()) {
+    Instruction *CurrentI = &*I++;
+    if (!CurrentI->use_empty()) {
+      // If an existing PHI has users after the call, there is no need to create
+      // a new one.
+      if (isa<PHINode>(CurrentI))
+        continue;
+      PHINode *NewPN = PHINode::Create(CurrentI->getType(), Preds.size());
+      for (auto &Mapping : ValueToValueMaps)
+        NewPN->addIncoming(Mapping[CurrentI],
+                           cast<Instruction>(Mapping[CurrentI])->getParent());
+      NewPN->insertBefore(&*TailBB->begin());
+      CurrentI->replaceAllUsesWith(NewPN);
+    }
+    CurrentI->eraseFromParent();
+    // We are done once we handled the first original instruction in TailBB.
+    if (CurrentI == OriginalBegin)
+      break;
+  }
 }

 // Return true if the call-site has an argument which is a PHI with only
@@ -385,45 +435,59 @@ static bool isPredicatedOnPHI(CallSite CS) {
   return false;
 }

-static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) {
+static bool tryToSplitOnPHIPredicatedArgument(CallSite CS, DominatorTree *DT) {
   if (!isPredicatedOnPHI(CS))
     return false;

   auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
-  splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr);
+  SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS = {
+      {Preds[0], {}}, {Preds[1], {}}};
+  splitCallSite(CS, PredsCS, DT);
   return true;
 }

-static bool tryToSplitOnPredicatedArgument(CallSite CS) {
+static bool tryToSplitOnPredicatedArgument(CallSite CS, DominatorTree *DT) {
   auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
   if (Preds[0] == Preds[1])
     return false;

-  SmallVector<std::pair<ICmpInst *, unsigned>, 2> C1, C2;
-  recordConditions(CS, Preds[0], C1);
-  recordConditions(CS, Preds[1], C2);
+  SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS;
+  for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) {
+    ConditionsTy Conditions;
+    recordConditions(CS, Pred, Conditions);
+    PredsCS.push_back({Pred, Conditions});
+  }

-  Instruction *CallInst1 = addConditions(CS, C1);
-  Instruction *CallInst2 = addConditions(CS, C2);
-  if (!CallInst1 && !CallInst2)
+  if (std::all_of(PredsCS.begin(), PredsCS.end(),
+                  [](const std::pair<BasicBlock *, ConditionsTy> &P) {
+                    return P.second.empty();
+                  }))
     return false;

-  splitCallSite(CS, Preds[1], Preds[0], CallInst2, CallInst1);
+  splitCallSite(CS, PredsCS, DT);
   return true;
 }

-static bool tryToSplitCallSite(CallSite CS) {
-  if (!CS.arg_size() || !canSplitCallSite(CS))
+static bool tryToSplitCallSite(CallSite CS, TargetTransformInfo &TTI,
+                               DominatorTree *DT) {
+  if (!CS.arg_size() || !canSplitCallSite(CS, TTI))
     return false;
-  return tryToSplitOnPredicatedArgument(CS) ||
-         tryToSplitOnPHIPredicatedArgument(CS);
+  return tryToSplitOnPredicatedArgument(CS, DT) ||
+         tryToSplitOnPHIPredicatedArgument(CS, DT);
 }

-static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
+                                TargetTransformInfo &TTI, DominatorTree *DT) {
   bool Changed = false;
   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
     BasicBlock &BB = *BI++;
-    for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+    auto II = BB.getFirstNonPHIOrDbg()->getIterator();
+    auto IE = BB.getTerminator()->getIterator();
+    // Iterate until we reach the terminator instruction. tryToSplitCallSite
+    // can replace BB's terminator in case BB is a successor of itself. In that
+    // case, IE will be invalidated and we also have to check the current
+    // terminator.
+    while (II != IE && &*II != BB.getTerminator()) {
       Instruction *I = &*II++;
       CallSite CS(cast<Value>(I));
       if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
@@ -437,7 +501,7 @@ static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
       // Check if such path is possible before attempting the splitting.
       bool IsMustTail = CS.isMustTailCall();

-      Changed |= tryToSplitCallSite(CS);
+      Changed |= tryToSplitCallSite(CS, TTI, DT);

       // There're no interesting instructions after this. The call site
       // itself might have been erased on splitting.
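The rework above is gated by the new DuplicationThreshold option: everything preceding the call in its block must be copied into each split block, so splitting is refused once the summed code-size cost of those instructions reaches the threshold. A minimal sketch of that gate under stated assumptions — costOf stands in for TTI.getInstructionCost(..., TCK_CodeSize), and the function name is ours, not LLVM's:

#include <cstdio>
#include <functional>
#include <vector>

template <typename Inst>
bool underDuplicationThreshold(const std::vector<Inst *> &InstsBeforeCall,
                               const std::function<unsigned(Inst *)> &costOf,
                               unsigned Threshold = 5) {
  unsigned Cost = 0;
  for (Inst *I : InstsBeforeCall) {
    Cost += costOf(I);        // code-size cost of one duplicated instruction
    if (Cost >= Threshold)
      return false;           // too expensive to copy into both predecessors
  }
  return true;
}

int main() {
  std::vector<int *> none;    // imagine the instructions before the call here
  auto unitCost = [](int *) -> unsigned { return 1; };
  printf("%d\n", underDuplicationThreshold<int>(none, unitCost)); // prints 1
}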
@@ -457,6 +521,8 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {

   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }

@@ -465,7 +531,10 @@ struct CallSiteSplittingLegacyPass : public FunctionPass {
       return false;

     auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
-    return doCallSiteSplitting(F, TLI);
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+    return doCallSiteSplitting(F, TLI, TTI,
+                               DTWP ? &DTWP->getDomTree() : nullptr);
   }
 };
 } // namespace
@@ -474,6 +543,7 @@ char CallSiteSplittingLegacyPass::ID = 0;
 INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
                       "Call-site splitting", false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
                     "Call-site splitting", false, false)
 FunctionPass *llvm::createCallSiteSplittingPass() {
@@ -483,9 +553,12 @@ FunctionPass *llvm::createCallSiteSplittingPass() {
 PreservedAnalyses CallSiteSplittingPass::run(Function &F,
                                              FunctionAnalysisManager &AM) {
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);

-  if (!doCallSiteSplitting(F, TLI))
+  if (!doCallSiteSplitting(F, TLI, TTI, DT))
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
   return PA;
 }
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index e4b08c5ed305..3a675b979017 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -43,8 +43,10 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstrTypes.h"
@@ -59,8 +61,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -84,7 +84,7 @@ static cl::opt<bool> ConstHoistWithBlockFrequency(

 namespace {

-/// \brief The constant hoisting pass.
+/// The constant hoisting pass.
 class ConstantHoistingLegacyPass : public FunctionPass {
 public:
   static char ID; // Pass identification, replacement for typeid
@@ -127,13 +127,13 @@ FunctionPass *llvm::createConstantHoistingPass() {
   return new ConstantHoistingLegacyPass();
 }

-/// \brief Perform the constant hoisting optimization for the given function.
+/// Perform the constant hoisting optimization for the given function.
 bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
   if (skipFunction(Fn))
     return false;

-  DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
-  DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+  LLVM_DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
+  LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');

   bool MadeChange =
       Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
@@ -144,16 +144,16 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
                    Fn.getEntryBlock());

   if (MadeChange) {
-    DEBUG(dbgs() << "********** Function after Constant Hoisting: "
-                 << Fn.getName() << '\n');
-    DEBUG(dbgs() << Fn);
+    LLVM_DEBUG(dbgs() << "********** Function after Constant Hoisting: "
+                      << Fn.getName() << '\n');
+    LLVM_DEBUG(dbgs() << Fn);
   }
-  DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
+  LLVM_DEBUG(dbgs() << "********** End Constant Hoisting **********\n");

   return MadeChange;
 }

-/// \brief Find the constant materialization insertion point.
+/// Find the constant materialization insertion point.
 Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
                                                    unsigned Idx) const {
   // If the operand is a cast instruction, then we have to materialize the
@@ -187,7 +187,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
   return IDom->getBlock()->getTerminator();
 }

-/// \brief Given \p BBs as input, find another set of BBs which collectively
+/// Given \p BBs as input, find another set of BBs which collectively
 /// dominates \p BBs and have the minimal sum of frequencies. Return the BB
 /// set found in \p BBs.
 static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
@@ -289,7 +289,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
   }
 }

-/// \brief Find an insertion point that dominates all uses.
+/// Find an insertion point that dominates all uses.
 SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
     const ConstantInfo &ConstInfo) const {
   assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
@@ -335,7 +335,7 @@ SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
   return InsertPts;
 }

-/// \brief Record constant integer ConstInt for instruction Inst at operand
+/// Record constant integer ConstInt for instruction Inst at operand
 /// index Idx.
 ///
 /// The operand at index Idx is not necessarily the constant integer itself. It
@@ -364,18 +364,17 @@ void ConstantHoistingPass::collectConstantCandidates(
     Itr->second = ConstCandVec.size() - 1;
   }
   ConstCandVec[Itr->second].addUser(Inst, Idx, Cost);
-  DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx)))
-          dbgs() << "Collect constant " << *ConstInt << " from " << *Inst
-                 << " with cost " << Cost << '\n';
-        else
-          dbgs() << "Collect constant " << *ConstInt << " indirectly from "
-                 << *Inst << " via " << *Inst->getOperand(Idx) << " with cost "
-                 << Cost << '\n';
-        );
+  LLVM_DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx))) dbgs()
+                 << "Collect constant " << *ConstInt << " from " << *Inst
+                 << " with cost " << Cost << '\n';
+             else dbgs() << "Collect constant " << *ConstInt
+                         << " indirectly from " << *Inst << " via "
+                         << *Inst->getOperand(Idx) << " with cost " << Cost
+                         << '\n';);
 }

-/// \brief Check the operand for instruction Inst at index Idx.
+/// Check the operand for instruction Inst at index Idx.
 void ConstantHoistingPass::collectConstantCandidates(
     ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
   Value *Opnd = Inst->getOperand(Idx);
@@ -416,7 +415,7 @@ void ConstantHoistingPass::collectConstantCandidates(
   }
 }

-/// \brief Scan the instruction for expensive integer constants and record them
+/// Scan the instruction for expensive integer constants and record them
 /// in the constant candidate vector.
 void ConstantHoistingPass::collectConstantCandidates(
     ConstCandMapType &ConstCandMap, Instruction *Inst) {
@@ -436,7 +435,7 @@ void ConstantHoistingPass::collectConstantCandidates(
   } // end of for all operands
 }

-/// \brief Collect all integer constants in the function that cannot be folded
+/// Collect all integer constants in the function that cannot be folded
 /// into an instruction itself.
 void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
   ConstCandMapType ConstCandMap;
@@ -501,20 +500,21 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
     return NumUses;
   }

-  DEBUG(dbgs() << "== Maximize constants in range ==\n");
+  LLVM_DEBUG(dbgs() << "== Maximize constants in range ==\n");
   int MaxCost = -1;
   for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
     auto Value = ConstCand->ConstInt->getValue();
     Type *Ty = ConstCand->ConstInt->getType();
     int Cost = 0;
     NumUses += ConstCand->Uses.size();
-    DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue() << "\n");
+    LLVM_DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue()
+                      << "\n");

     for (auto User : ConstCand->Uses) {
       unsigned Opcode = User.Inst->getOpcode();
       unsigned OpndIdx = User.OpndIdx;
       Cost += TTI->getIntImmCost(Opcode, OpndIdx, Value, Ty);
-      DEBUG(dbgs() << "Cost: " << Cost << "\n");
+      LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n");

       for (auto C2 = S; C2 != E; ++C2) {
         Optional<APInt> Diff = calculateOffsetDiff(
@@ -524,24 +524,24 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
           const int ImmCosts =
               TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty);
           Cost -= ImmCosts;
-          DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
-                       << "has penalty: " << ImmCosts << "\n"
-                       << "Adjusted cost: " << Cost << "\n");
+          LLVM_DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
+                            << "has penalty: " << ImmCosts << "\n"
+                            << "Adjusted cost: " << Cost << "\n");
         }
       }
     }
-    DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
+    LLVM_DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
     if (Cost > MaxCost) {
       MaxCost = Cost;
       MaxCostItr = ConstCand;
-      DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
-                   << "\n");
+      LLVM_DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
+                        << "\n");
     }
   }
   return NumUses;
 }

-/// \brief Find the base constant within the given range and rebase all other
+/// Find the base constant within the given range and rebase all other
 /// constants with respect to the base constant.
 void ConstantHoistingPass::findAndMakeBaseConstant(
     ConstCandVecType::iterator S, ConstCandVecType::iterator E) {
@@ -567,12 +567,12 @@ void ConstantHoistingPass::findAndMakeBaseConstant(
   ConstantVec.push_back(std::move(ConstInfo));
 }

-/// \brief Finds and combines constant candidates that can be easily
+/// Finds and combines constant candidates that can be easily
 /// rematerialized with an add from a common base constant.
 void ConstantHoistingPass::findBaseConstants() {
   // Sort the constants by value and type. This invalidates the mapping!
-  std::sort(ConstCandVec.begin(), ConstCandVec.end(),
-            [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
+  llvm::sort(ConstCandVec.begin(), ConstCandVec.end(),
+             [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
     if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
       return LHS.ConstInt->getType()->getBitWidth() <
              RHS.ConstInt->getType()->getBitWidth();
@@ -601,7 +601,7 @@ void ConstantHoistingPass::findBaseConstants() {
   findAndMakeBaseConstant(MinValItr, ConstCandVec.end());
 }

-/// \brief Updates the operand at Idx in instruction Inst with the result of
+/// Updates the operand at Idx in instruction Inst with the result of
 /// instruction Mat. If the instruction is a PHI node then special
 /// handling for duplicate values form the same incoming basic block is
 /// required.
@@ -629,7 +629,7 @@ static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
   return true;
 }

-/// \brief Emit materialization code for all rebased constants and update their
+/// Emit materialization code for all rebased constants and update their
 /// users.
 void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
                                              Constant *Offset,
@@ -641,19 +641,20 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
     Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
                                  "const_mat", InsertionPt);

-    DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
-                 << " + " << *Offset << ") in BB "
-                 << Mat->getParent()->getName() << '\n' << *Mat << '\n');
+    LLVM_DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
+                      << " + " << *Offset << ") in BB "
+                      << Mat->getParent()->getName() << '\n'
+                      << *Mat << '\n');
     Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
   }
   Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);

   // Visit constant integer.
   if (isa<ConstantInt>(Opnd)) {
-    DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+    LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
     if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
       Mat->eraseFromParent();
-    DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+    LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
     return;
   }

@@ -669,13 +670,13 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
       ClonedCastInst->insertAfter(CastInst);
       // Use the same debug location as the original cast instruction.
       ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
-      DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
-                   << "To : " << *ClonedCastInst << '\n');
+      LLVM_DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
+                        << "To : " << *ClonedCastInst << '\n');
     }

-    DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+    LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
     updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
-    DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+    LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
     return;
   }

@@ -689,20 +690,20 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
       // Use the same debug location as the instruction we are about to update.
       ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());

-      DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
-                   << "From : " << *ConstExpr << '\n');
-      DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+      LLVM_DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
+                        << "From : " << *ConstExpr << '\n');
+      LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
       if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
         ConstExprInst->eraseFromParent();
         if (Offset)
           Mat->eraseFromParent();
       }
-      DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+      LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
       return;
     }
   }
 }

-/// \brief Hoist and hide the base constant behind a bitcast and emit
+/// Hoist and hide the base constant behind a bitcast and emit
 /// materialization code for derived constants.
 bool ConstantHoistingPass::emitBaseConstants() {
   bool MadeChange = false;
@@ -720,9 +721,9 @@ bool ConstantHoistingPass::emitBaseConstants() {

     Base->setDebugLoc(IP->getDebugLoc());

-    DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
-                 << ") to BB " << IP->getParent()->getName() << '\n'
-                 << *Base << '\n');
+    LLVM_DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
+                      << ") to BB " << IP->getParent()->getName() << '\n'
+                      << *Base << '\n');

     // Emit materialization code for all rebased constants.
     unsigned Uses = 0;
@@ -765,7 +766,7 @@ bool ConstantHoistingPass::emitBaseConstants() {
   return MadeChange;
 }

-/// \brief Check all cast instructions we made a copy of and remove them if they
+/// Check all cast instructions we made a copy of and remove them if they
 /// have no more users.
 void ConstantHoistingPass::deleteDeadCastInst() const {
   for (auto const &I : ClonedCastMap)
@@ -773,7 +774,7 @@ void ConstantHoistingPass::deleteDeadCastInst() const {
       I.first->eraseFromParent();
 }

-/// \brief Optimize expensive integer constants in the given function.
+/// Optimize expensive integer constants in the given function.
 bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
                                    DominatorTree &DT, BlockFrequencyInfo *BFI,
                                    BasicBlock &Entry) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
index 4fa27891a974..46915889ce7c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
@@ -21,12 +21,12 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include <set>

 using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 8f468ebf8949..ea148b728a10 100644
--- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -28,11 +29,11 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Type.h"
@@ -43,7 +44,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <utility>

@@ -52,12 +52,14 @@ using namespace llvm;
 #define DEBUG_TYPE "correlated-value-propagation"

 STATISTIC(NumPhis, "Number of phis propagated");
+STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value");
 STATISTIC(NumSelects, "Number of selects propagated");
 STATISTIC(NumMemAccess, "Number of memory access targets propagated");
 STATISTIC(NumCmps, "Number of comparisons propagated");
 STATISTIC(NumReturns, "Number of return values propagated");
 STATISTIC(NumDeadCases, "Number of switch cases removed");
 STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
+STATISTIC(NumUDivs, "Number of udivs whose width was decreased");
 STATISTIC(NumAShrs, "Number of ashr converted to lshr");
 STATISTIC(NumSRems, "Number of srem converted to urem");
 STATISTIC(NumOverflows, "Number of overflow checks removed");
@@ -77,8 +79,10 @@ namespace {
     bool runOnFunction(Function &F) override;

     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<LazyValueInfoWrapperPass>();
       AU.addPreserved<GlobalsAAWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
     }
   };

@@ -88,6 +92,7 @@ char CorrelatedValuePropagation::ID = 0;

 INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
                       "Value Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
 INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
                     "Value Propagation", false, false)
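Among the CorrelatedValuePropagation hunks that follow, the new processUDivOrURem() narrows a udiv/urem when LazyValueInfo proves both operands fit in a smaller power-of-two width (never below 8 bits). A standalone sketch of just the width computation it performs — the helper name and driver below are illustrative assumptions, not the patch's code:

#include <cstdint>
#include <cstdio>

static unsigned narrowedWidth(uint64_t MaxOperandValue, unsigned OrigWidth) {
  unsigned ActiveBits = 0;
  while (ActiveBits < 64 && (MaxOperandValue >> ActiveBits))
    ++ActiveBits;                 // count significant bits of the largest operand
  unsigned NewWidth = 8;          // don't shrink below 8 bits wide
  while (NewWidth < ActiveBits)   // PowerOf2Ceil, clamped to >= 8
    NewWidth *= 2;
  return NewWidth < OrigWidth ? NewWidth : OrigWidth;
}

int main() {
  // i64 operands proven <= 1000 (10 active bits) can use an i16 divide:
  // trunc -> udiv -> zext replaces the wide udiv.
  printf("%u\n", narrowedWidth(1000, 64)); // prints 16
}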
@@ -101,14 +106,14 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
   if (S->getType()->isVectorTy()) return false;
   if (isa<Constant>(S->getOperand(0))) return false;

-  Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
+  Constant *C = LVI->getConstant(S->getCondition(), S->getParent(), S);
   if (!C) return false;

   ConstantInt *CI = dyn_cast<ConstantInt>(C);
   if (!CI) return false;

-  Value *ReplaceWith = S->getOperand(1);
-  Value *Other = S->getOperand(2);
+  Value *ReplaceWith = S->getTrueValue();
+  Value *Other = S->getFalseValue();
   if (!CI->isOne()) std::swap(ReplaceWith, Other);
   if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());

@@ -120,7 +125,63 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
   return true;
 }

-static bool processPHI(PHINode *P, LazyValueInfo *LVI,
+/// Try to simplify a phi with constant incoming values that match the edge
+/// values of a non-constant value on all other edges:
+/// bb0:
+///   %isnull = icmp eq i8* %x, null
+///   br i1 %isnull, label %bb2, label %bb1
+/// bb1:
+///   br label %bb2
+/// bb2:
+///   %r = phi i8* [ %x, %bb1 ], [ null, %bb0 ]
+/// -->
+///   %r = %x
+static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI,
+                                   DominatorTree *DT) {
+  // Collect incoming constants and initialize possible common value.
+  SmallVector<std::pair<Constant *, unsigned>, 4> IncomingConstants;
+  Value *CommonValue = nullptr;
+  for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) {
+    Value *Incoming = P->getIncomingValue(i);
+    if (auto *IncomingConstant = dyn_cast<Constant>(Incoming)) {
+      IncomingConstants.push_back(std::make_pair(IncomingConstant, i));
+    } else if (!CommonValue) {
+      // The potential common value is initialized to the first non-constant.
+      CommonValue = Incoming;
+    } else if (Incoming != CommonValue) {
+      // There can be only one non-constant common value.
+      return false;
+    }
+  }
+
+  if (!CommonValue || IncomingConstants.empty())
+    return false;
+
+  // The common value must be valid in all incoming blocks.
+  BasicBlock *ToBB = P->getParent();
+  if (auto *CommonInst = dyn_cast<Instruction>(CommonValue))
+    if (!DT->dominates(CommonInst, ToBB))
+      return false;
+
+  // We have a phi with exactly 1 variable incoming value and 1 or more constant
+  // incoming values. See if all constant incoming values can be mapped back to
+  // the same incoming variable value.
+  for (auto &IncomingConstant : IncomingConstants) {
+    Constant *C = IncomingConstant.first;
+    BasicBlock *IncomingBB = P->getIncomingBlock(IncomingConstant.second);
+    if (C != LVI->getConstantOnEdge(CommonValue, IncomingBB, ToBB, P))
+      return false;
+  }
+
+  // All constant incoming values map to the same variable along the incoming
+  // edges of the phi. The phi is unnecessary.
+  P->replaceAllUsesWith(CommonValue);
+  P->eraseFromParent();
+  ++NumPhiCommon;
+  return true;
+}
+
+static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT,
                        const SimplifyQuery &SQ) {
   bool Changed = false;

@@ -168,7 +229,7 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI,
         V = SI->getTrueValue();
       }

-      DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
+      LLVM_DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
     }

     P->setIncomingValue(i, V);
@@ -181,6 +242,9 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI,
     Changed = true;
   }

+  if (!Changed)
+    Changed = simplifyCommonValuePhi(P, LVI, DT);
+
   if (Changed)
     ++NumPhis;

@@ -243,7 +307,7 @@ static bool processCmp(CmpInst *C, LazyValueInfo *LVI) {
 /// that cannot fire no matter what the incoming edge can safely be removed. If
 /// a case fires on every incoming edge then the entire switch can be removed
 /// and replaced with a branch to the case destination.
-static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
+static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI, DominatorTree *DT) {
   Value *Cond = SI->getCondition();
   BasicBlock *BB = SI->getParent();

@@ -258,6 +322,10 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
   // Analyse each switch case in turn.
   bool Changed = false;
+  DenseMap<BasicBlock*, int> SuccessorsCount;
+  for (auto *Succ : successors(BB))
+    SuccessorsCount[Succ]++;
+
   for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
     ConstantInt *Case = CI->getCaseValue();

@@ -292,7 +360,8 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {

     if (State == LazyValueInfo::False) {
       // This case never fires - remove it.
-      CI->getCaseSuccessor()->removePredecessor(BB);
+      BasicBlock *Succ = CI->getCaseSuccessor();
+      Succ->removePredecessor(BB);
       CI = SI->removeCase(CI);
       CE = SI->case_end();

@@ -302,6 +371,8 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
       ++NumDeadCases;
       Changed = true;
+      if (--SuccessorsCount[Succ] == 0)
+        DT->deleteEdge(BB, Succ);
       continue;
     }
     if (State == LazyValueInfo::True) {
@@ -318,10 +389,14 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
     ++CI;
   }

-  if (Changed)
+  if (Changed) {
     // If the switch has been simplified to the point where it can be replaced
     // by a branch then do so now.
-    ConstantFoldTerminator(BB);
+    DeferredDominance DDT(*DT);
+    ConstantFoldTerminator(BB, /*DeleteDeadConditions = */ false,
+                           /*TLI = */ nullptr, &DDT);
+    DDT.flush();
+  }

   return Changed;
 }
@@ -430,9 +505,50 @@ static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
   return true;
 }

+/// Try to shrink a udiv/urem's width down to the smallest power of two that's
+/// sufficient to contain its operands.
+static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) {
+  assert(Instr->getOpcode() == Instruction::UDiv ||
+         Instr->getOpcode() == Instruction::URem);
+  if (Instr->getType()->isVectorTy())
+    return false;
+
+  // Find the smallest power of two bitwidth that's sufficient to hold Instr's
+  // operands.
+  auto OrigWidth = Instr->getType()->getIntegerBitWidth();
+  ConstantRange OperandRange(OrigWidth, /*isFullset=*/false);
+  for (Value *Operand : Instr->operands()) {
+    OperandRange = OperandRange.unionWith(
+        LVI->getConstantRange(Operand, Instr->getParent()));
+  }
+  // Don't shrink below 8 bits wide.
+  unsigned NewWidth = std::max<unsigned>(
+      PowerOf2Ceil(OperandRange.getUnsignedMax().getActiveBits()), 8);
+  // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
+  // two.
+  if (NewWidth >= OrigWidth)
+    return false;
+
+  ++NumUDivs;
+  auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth);
+  auto *LHS = CastInst::Create(Instruction::Trunc, Instr->getOperand(0), TruncTy,
+                               Instr->getName() + ".lhs.trunc", Instr);
+  auto *RHS = CastInst::Create(Instruction::Trunc, Instr->getOperand(1), TruncTy,
+                               Instr->getName() + ".rhs.trunc", Instr);
+  auto *BO =
+      BinaryOperator::Create(Instr->getOpcode(), LHS, RHS, Instr->getName(), Instr);
+  auto *Zext = CastInst::Create(Instruction::ZExt, BO, Instr->getType(),
+                                Instr->getName() + ".zext", Instr);
+  if (BO->getOpcode() == Instruction::UDiv)
+    BO->setIsExact(Instr->isExact());
+
+  Instr->replaceAllUsesWith(Zext);
+  Instr->eraseFromParent();
+  return true;
+}
+
 static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
-  if (SDI->getType()->isVectorTy() ||
-      !hasPositiveOperands(SDI, LVI))
+  if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI))
     return false;

   ++NumSRems;
@@ -440,6 +556,10 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
                                         SDI->getName(), SDI);
   SDI->replaceAllUsesWith(BO);
   SDI->eraseFromParent();
+
+  // Try to process our new urem.
+  processUDivOrURem(BO, LVI);
+
   return true;
 }

@@ -449,8 +569,7 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
 /// conditions, this can sometimes prove conditions instcombine can't by
 /// exploiting range information.
 static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
-  if (SDI->getType()->isVectorTy() ||
-      !hasPositiveOperands(SDI, LVI))
+  if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI))
     return false;

   ++NumSDivs;
@@ -460,6 +579,9 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
   SDI->replaceAllUsesWith(BO);
   SDI->eraseFromParent();

+  // Try to simplify our new udiv.
+  processUDivOrURem(BO, LVI);
+
   return true;
 }

@@ -559,7 +681,8 @@ static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
     ConstantInt::getFalse(C->getContext());
 }

-static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
+static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
+                    const SimplifyQuery &SQ) {
   bool FnChanged = false;
   // Visiting in a pre-order depth-first traversal causes us to simplify early
   // blocks before querying later blocks (which require us to analyze early
@@ -575,7 +698,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
         BBChanged |= processSelect(cast<SelectInst>(II), LVI);
         break;
       case Instruction::PHI:
-        BBChanged |= processPHI(cast<PHINode>(II), LVI, SQ);
+        BBChanged |= processPHI(cast<PHINode>(II), LVI, DT, SQ);
         break;
       case Instruction::ICmp:
       case Instruction::FCmp:
@@ -595,6 +718,10 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
       case Instruction::SDiv:
         BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI);
         break;
+      case Instruction::UDiv:
+      case Instruction::URem:
+        BBChanged |= processUDivOrURem(cast<BinaryOperator>(II), LVI);
+        break;
       case Instruction::AShr:
         BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
         break;
@@ -607,7 +734,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
     Instruction *Term = BB->getTerminator();
     switch (Term->getOpcode()) {
     case Instruction::Switch:
-      BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI);
+      BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI, DT);
       break;
     case Instruction::Ret: {
       auto *RI = cast<ReturnInst>(Term);
@@ -636,18 +763,22 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) {
     return false;

   LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
-  return runImpl(F, LVI, getBestSimplifyQuery(*this, F));
+  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  return runImpl(F, LVI, DT, getBestSimplifyQuery(*this, F));
 }

 PreservedAnalyses
 CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
-
   LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
-  bool Changed = runImpl(F, LVI, getBestSimplifyQuery(AM, F));
+  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+
+  bool Changed = runImpl(F, LVI, DT, getBestSimplifyQuery(AM, F));

   if (!Changed)
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
   PA.preserve<GlobalsAA>();
+  PA.preserve<DominatorTreeAnalysis>();
   return PA;
 }
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
index fa4806e884c3..6078967a0f94 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -20,11 +20,11 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;

 #define DEBUG_TYPE "dce"
@@ -50,6 +50,7 @@ namespace {
         for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
           Instruction *Inst = &*DI++;
           if (isInstructionTriviallyDead(Inst, TLI)) {
+            salvageDebugInfo(*Inst);
             Inst->eraseFromParent();
             Changed = true;
             ++DIEEliminated;
@@ -76,6 +77,8 @@ static bool DCEInstruction(Instruction *I,
                            SmallSetVector<Instruction *, 16> &WorkList,
                            const TargetLibraryInfo *TLI) {
   if (isInstructionTriviallyDead(I, TLI)) {
+    salvageDebugInfo(*I);
+
     // Null out all of the instruction's operands to see if any operand becomes
     // dead as we go.
     for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index b665d94a70aa..dd1a2a6adb82 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -30,6 +30,7 @@
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
@@ -56,11 +57,10 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 #include <cassert>
-#include <cstdint>
 #include <cstddef>
+#include <cstdint>
 #include <iterator>
 #include <map>
 #include <utility>
@@ -115,6 +115,9 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
     Instruction *DeadInst = NowDeadInsts.pop_back_val();
     ++NumFastOther;

+    // Try to preserve debug information attached to the dead instruction.
+    salvageDebugInfo(*DeadInst);
+
     // This instruction is dead, zap it, in stages. Start by removing it from
     // MemDep, which needs to know the operands and needs it to be in the
     // function.
@@ -146,7 +149,8 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,

 /// Does this instruction write some memory? This only returns true for things
 /// that we can analyze with other helpers below.
-static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
+static bool hasAnalyzableMemoryWrite(Instruction *I,
+                                     const TargetLibraryInfo &TLI) {
   if (isa<StoreInst>(I))
     return true;
   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
@@ -156,6 +160,9 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
     case Intrinsic::memset:
     case Intrinsic::memmove:
     case Intrinsic::memcpy:
+    case Intrinsic::memcpy_element_unordered_atomic:
+    case Intrinsic::memmove_element_unordered_atomic:
+    case Intrinsic::memset_element_unordered_atomic:
     case Intrinsic::init_trampoline:
     case Intrinsic::lifetime_end:
       return true;
@@ -180,43 +187,45 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
 /// Return a Location stored to by the specified instruction. If isRemovable
 /// returns true, this function and getLocForRead completely describe the memory
 /// operations for this instruction.
-static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
+static MemoryLocation getLocForWrite(Instruction *Inst) {
+
   if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
     return MemoryLocation::get(SI);

-  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) {
+  if (auto *MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
     // memcpy/memmove/memset.
     MemoryLocation Loc = MemoryLocation::getForDest(MI);
     return Loc;
   }

-  IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
-  if (!II)
-    return MemoryLocation();
-
-  switch (II->getIntrinsicID()) {
-  default:
-    return MemoryLocation(); // Unhandled intrinsic.
-  case Intrinsic::init_trampoline:
-    // FIXME: We don't know the size of the trampoline, so we can't really
-    // handle it here.
-    return MemoryLocation(II->getArgOperand(0));
-  case Intrinsic::lifetime_end: {
-    uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
-    return MemoryLocation(II->getArgOperand(1), Len);
-  }
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    switch (II->getIntrinsicID()) {
+    default:
+      return MemoryLocation(); // Unhandled intrinsic.
+    case Intrinsic::init_trampoline:
+      return MemoryLocation(II->getArgOperand(0));
+    case Intrinsic::lifetime_end: {
+      uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+      return MemoryLocation(II->getArgOperand(1), Len);
+    }
+    }
   }
+  if (auto CS = CallSite(Inst))
+    // All the supported TLI functions so far happen to have dest as their
+    // first argument.
+    return MemoryLocation(CS.getArgument(0));
   return MemoryLocation();
 }

-/// Return the location read by the specified "hasMemoryWrite" instruction if
-/// any.
+/// Return the location read by the specified "hasAnalyzableMemoryWrite"
+/// instruction if any.
 static MemoryLocation getLocForRead(Instruction *Inst,
                                     const TargetLibraryInfo &TLI) {
-  assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case");
+  assert(hasAnalyzableMemoryWrite(Inst, TLI) && "Unknown instruction case");

   // The only instructions that both read and write are the mem transfer
   // instructions (memcpy/memmove).
-  if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst))
+  if (auto *MTI = dyn_cast<AnyMemTransferInst>(Inst))
     return MemoryLocation::getForSource(MTI);
   return MemoryLocation();
 }
@@ -230,7 +239,7 @@ static bool isRemovable(Instruction *I) {

   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
     switch (II->getIntrinsicID()) {
-    default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate");
+    default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate");
     case Intrinsic::lifetime_end:
       // Never remove dead lifetime_end's, e.g. because it is followed by a
       // free.
@@ -243,9 +252,14 @@ static bool isRemovable(Instruction *I) {
     case Intrinsic::memcpy:
       // Don't remove volatile memory intrinsics.
       return !cast<MemIntrinsic>(II)->isVolatile();
+    case Intrinsic::memcpy_element_unordered_atomic:
+    case Intrinsic::memmove_element_unordered_atomic:
+    case Intrinsic::memset_element_unordered_atomic:
+      return true;
     }
   }

+  // note: only get here for calls with analyzable writes - i.e. libcalls
   if (auto CS = CallSite(I))
     return CS.getInstruction()->use_empty();

@@ -264,6 +278,8 @@ static bool isShortenableAtTheEnd(Instruction *I) {
     default: return false;
     case Intrinsic::memset:
     case Intrinsic::memcpy:
+    case Intrinsic::memcpy_element_unordered_atomic:
+    case Intrinsic::memset_element_unordered_atomic:
       // Do shorten memory intrinsics.
       // FIXME: Add memmove if it's also safe to transform.
       return true;
@@ -280,35 +296,27 @@ static bool isShortenableAtTheBeginning(Instruction *I) {
   // FIXME: Handle only memset for now. Supporting memcpy/memmove should be
   // easily done by offsetting the source address.
-  IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
-  return II && II->getIntrinsicID() == Intrinsic::memset;
+  return isa<AnyMemSetInst>(I);
 }

 /// Return the pointer that is being written to.
static Value *getStoredPointerOperand(Instruction *I) { - if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return SI->getPointerOperand(); - if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) - return MI->getDest(); - - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::init_trampoline: - return II->getArgOperand(0); - } - } - - CallSite CS(I); - // All the supported functions so far happen to have dest as their first - // argument. - return CS.getArgument(0); + //TODO: factor this to reuse getLocForWrite + MemoryLocation Loc = getLocForWrite(I); + assert(Loc.Ptr && + "unable to find pointer written for analyzable instruction?"); + // TODO: most APIs don't expect const Value * + return const_cast<Value*>(Loc.Ptr); } static uint64_t getPointerSize(const Value *V, const DataLayout &DL, - const TargetLibraryInfo &TLI) { + const TargetLibraryInfo &TLI, + const Function *F) { uint64_t Size; - if (getObjectSize(V, Size, DL, &TLI)) + ObjectSizeOpts Opts; + Opts.NullIsUnknownSize = NullPointerIsDefined(F); + + if (getObjectSize(V, Size, DL, &TLI, Opts)) return Size; return MemoryLocation::UnknownSize; } @@ -338,7 +346,9 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, const TargetLibraryInfo &TLI, int64_t &EarlierOff, int64_t &LaterOff, Instruction *DepWrite, - InstOverlapIntervalsTy &IOL) { + InstOverlapIntervalsTy &IOL, + AliasAnalysis &AA, + const Function *F) { // If we don't know the sizes of either access, then we can't do a comparison. if (Later.Size == MemoryLocation::UnknownSize || Earlier.Size == MemoryLocation::UnknownSize) @@ -349,7 +359,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, // If the start pointers are the same, we just have to compare sizes to see if // the later store was larger than the earlier store. - if (P1 == P2) { + if (P1 == P2 || AA.isMustAlias(P1, P2)) { // Make sure that the Later size is >= the Earlier size. if (Later.Size >= Earlier.Size) return OW_Complete; @@ -367,7 +377,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, return OW_Unknown; // If the "Later" store is to a recognizable object, get its size. - uint64_t ObjectSize = getPointerSize(UO2, DL, TLI); + uint64_t ObjectSize = getPointerSize(UO2, DL, TLI, F); if (ObjectSize != MemoryLocation::UnknownSize) if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size) return OW_Complete; @@ -415,9 +425,10 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, // Insert our part of the overlap into the map. auto &IM = IOL[DepWrite]; - DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff << ", " << - int64_t(EarlierOff + Earlier.Size) << ") Later [" << - LaterOff << ", " << int64_t(LaterOff + Later.Size) << ")\n"); + LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff + << ", " << int64_t(EarlierOff + Earlier.Size) + << ") Later [" << LaterOff << ", " + << int64_t(LaterOff + Later.Size) << ")\n"); // Make sure that we only insert non-overlapping intervals and combine // adjacent intervals. 
The intervals are stored in the map with the ending @@ -454,11 +465,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, ILI = IM.begin(); if (ILI->second <= EarlierOff && ILI->first >= int64_t(EarlierOff + Earlier.Size)) { - DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier [" << - EarlierOff << ", " << - int64_t(EarlierOff + Earlier.Size) << - ") Composite Later [" << - ILI->second << ", " << ILI->first << ")\n"); + LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier [" + << EarlierOff << ", " + << int64_t(EarlierOff + Earlier.Size) + << ") Composite Later [" << ILI->second << ", " + << ILI->first << ")\n"); ++NumCompletePartials; return OW_Complete; } @@ -469,10 +480,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, if (EnablePartialStoreMerging && LaterOff >= EarlierOff && int64_t(EarlierOff + Earlier.Size) > LaterOff && uint64_t(LaterOff - EarlierOff) + Later.Size <= Earlier.Size) { - DEBUG(dbgs() << "DSE: Partial overwrite an earlier load [" << EarlierOff - << ", " << int64_t(EarlierOff + Earlier.Size) - << ") by a later store [" << LaterOff << ", " - << int64_t(LaterOff + Later.Size) << ")\n"); + LLVM_DEBUG(dbgs() << "DSE: Partial overwrite an earlier load [" + << EarlierOff << ", " + << int64_t(EarlierOff + Earlier.Size) + << ") by a later store [" << LaterOff << ", " + << int64_t(LaterOff + Later.Size) << ")\n"); // TODO: Maybe come up with a better name? return OW_PartialEarlierWithFullLater; } @@ -514,8 +526,8 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, /// memory region into an identical pointer) then it doesn't actually make its /// input dead in the traditional sense. Consider this case: /// -/// memcpy(A <- B) -/// memcpy(A <- A) +/// memmove(A <- B) +/// memmove(A <- A) /// /// In this case, the second store to A does not make the first store to A dead. /// The usual situation isn't an explicit A<-A store like this (which can be @@ -531,24 +543,35 @@ static bool isPossibleSelfRead(Instruction *Inst, // Self reads can only happen for instructions that read memory. Get the // location read. MemoryLocation InstReadLoc = getLocForRead(Inst, TLI); - if (!InstReadLoc.Ptr) return false; // Not a reading instruction. + if (!InstReadLoc.Ptr) + return false; // Not a reading instruction. // If the read and written loc obviously don't alias, it isn't a read. - if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false; - - // Okay, 'Inst' may copy over itself. However, we can still remove a the - // DepWrite instruction if we can prove that it reads from the same location - // as Inst. This handles useful cases like: - // memcpy(A <- B) - // memcpy(A <- B) - // Here we don't know if A/B may alias, but we do know that B/B are must - // aliases, so removing the first memcpy is safe (assuming it writes <= # - // bytes as the second one. - MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI); - - if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) + if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false; + if (isa<AnyMemCpyInst>(Inst)) { + // LLVM's memcpy overlap semantics are not fully fleshed out (see PR11763) + // but in practice memcpy(A <- B) either means that A and B are disjoint or + // are equal (i.e. there are no partial overlaps).
Given that, if we have: + // + // memcpy/memmove(A <- B) // DepWrite + // memcpy(A <- B) // Inst + // + // with Inst reading/writing a size >= DepWrite's, we can reason as + // follows: + // + // - If A == B then both the copies are no-ops, so the DepWrite can be + // removed. + // - If A != B then A and B are disjoint locations in Inst. Since + // Inst.size >= DepWrite.size, A and B are disjoint in DepWrite too. + // Therefore DepWrite can be removed. + MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI); + + if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) + return false; + } + // If DepWrite doesn't read memory or if we can't prove it is a must alias, // then it can't be considered dead. return true; @@ -650,7 +673,8 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA, MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB); while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); - if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency)) + if (!hasAnalyzableMemoryWrite(Dependency, *TLI) || + !isRemovable(Dependency)) break; Value *DepPointer = @@ -660,8 +684,9 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA, if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) break; - DEBUG(dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: " - << *Dependency << '\n'); + LLVM_DEBUG( + dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: " + << *Dependency << '\n'); // DCE instructions only used to calculate that store. BasicBlock::iterator BBI(Dependency); @@ -690,7 +715,8 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA, static void removeAccessedObjects(const MemoryLocation &LoadedLoc, SmallSetVector<Value *, 16> &DeadStackObjects, const DataLayout &DL, AliasAnalysis *AA, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo *TLI, + const Function *F) { const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL); // A constant can't be in the dead pointer set. @@ -707,7 +733,7 @@ static void removeAccessedObjects(const MemoryLocation &LoadedLoc, // Remove objects that could alias LoadedLoc. DeadStackObjects.remove_if([&](Value *I) { // See if the loaded location could alias the stack location. - MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI)); + MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI, F)); return !AA->isNoAlias(StackLoc, LoadedLoc); }); } @@ -754,7 +780,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, --BBI; // If we find a store, check to see if it points into a dead stack value.
- if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) { + if (hasAnalyzableMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) { // See through pointer-to-pointer bitcasts SmallVector<Value *, 4> Pointers; GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL); @@ -770,15 +796,16 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, if (AllDead) { Instruction *Dead = &*BBI; - DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " - << *Dead << "\n Objects: "; - for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), - E = Pointers.end(); I != E; ++I) { - dbgs() << **I; - if (std::next(I) != E) - dbgs() << ", "; - } - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " + << *Dead << "\n Objects: "; + for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), + E = Pointers.end(); + I != E; ++I) { + dbgs() << **I; + if (std::next(I) != E) + dbgs() << ", "; + } dbgs() + << '\n'); // DCE instructions only used to calculate that store. deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects); @@ -790,8 +817,8 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, // Remove any dead non-memory-mutating instructions. if (isInstructionTriviallyDead(&*BBI, TLI)) { - DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: " - << *&*BBI << '\n'); + LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: " + << *&*BBI << '\n'); deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, InstrOrdering, &DeadStackObjects); ++NumFastOther; MadeChange = true; @@ -820,7 +847,8 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, // the call is live. DeadStackObjects.remove_if([&](Value *I) { // See if the call site touches the value. - return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI))); + return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI, + BB.getParent()))); }); // If all of the allocas were clobbered by the call then we're not going @@ -848,8 +876,6 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, LoadedLoc = MemoryLocation::get(L); } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) { LoadedLoc = MemoryLocation::get(V); - } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) { - LoadedLoc = MemoryLocation::getForSource(MTI); } else if (!BBI->mayReadFromMemory()) { // Instruction doesn't read memory. Note that stores that weren't removed // above will hit this case. @@ -861,7 +887,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, // Remove any allocas from the DeadPointer set that are loaded, as this // makes any stores above the access live. - removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI); + removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI, BB.getParent()); // If all of the allocas were clobbered by the access then we're not going // to find anything else to process. 
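The interval bookkeeping that the isOverwrite() hunks above thread through InstOverlapIntervalsTy comes down to one idea: record which byte ranges of the earlier store have been clobbered by later stores, coalesce ranges that touch, and declare the earlier store completely dead once a single coalesced range covers it. A minimal standalone sketch of that idea, assuming nothing from LLVM (OverlapIntervals, addInterval, and coversRange are illustrative names, not the pass's):

    #include <algorithm>
    #include <cstdint>
    #include <map>

    // Map from interval end offset to interval start offset, mirroring the
    // shape of the per-write entries DSE keeps.
    using OverlapIntervals = std::map<int64_t, int64_t>;

    // Record that bytes [Start, End) of the earlier store were overwritten,
    // coalescing with any recorded interval that overlaps or abuts the new one.
    void addInterval(OverlapIntervals &IM, int64_t Start, int64_t End) {
      auto ILI = IM.lower_bound(Start); // first interval ending at/after Start
      while (ILI != IM.end() && ILI->second <= End) {
        Start = std::min(Start, ILI->second);
        End = std::max(End, ILI->first);
        ILI = IM.erase(ILI);
      }
      IM[End] = Start;
    }

    // The earlier store [EarlierOff, EarlierOff + EarlierSize) is fully dead
    // once exactly one coalesced interval remains and it covers the store.
    bool coversRange(const OverlapIntervals &IM, int64_t EarlierOff,
                     int64_t EarlierSize) {
      return IM.size() == 1 && IM.begin()->second <= EarlierOff &&
             IM.begin()->first >= EarlierOff + EarlierSize;
    }

For instance, addInterval(IM, 0, 8) followed by addInterval(IM, 8, 16) leaves the single interval [0, 16), so coversRange(IM, 0, 16) reports a 16-byte earlier store at offset 0 as fully overwritten, the "Full overwrite from partials" case in the debug output above.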
@@ -881,8 +907,8 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset, // Power of 2 vector writes are probably always a bad idea to optimize // as any store/memset/memcpy is likely using vector instructions so // shortening it to not vector size is likely to be slower - MemIntrinsic *EarlierIntrinsic = cast<MemIntrinsic>(EarlierWrite); - unsigned EarlierWriteAlign = EarlierIntrinsic->getAlignment(); + auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite); + unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment(); if (!IsOverwriteEnd) LaterOffset = int64_t(LaterOffset + LaterSize); @@ -890,15 +916,23 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset, !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0)) return false; - DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW " - << (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *EarlierWrite - << "\n KILLER (offset " << LaterOffset << ", " << EarlierSize - << ")\n"); - int64_t NewLength = IsOverwriteEnd ? LaterOffset - EarlierOffset : EarlierSize - (LaterOffset - EarlierOffset); + if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) { + // When shortening an atomic memory intrinsic, the newly shortened + // length must remain an integer multiple of the element size. + const uint32_t ElementSize = AMI->getElementSizeInBytes(); + if (0 != NewLength % ElementSize) + return false; + } + + LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW " + << (IsOverwriteEnd ? "END" : "BEGIN") << ": " + << *EarlierWrite << "\n KILLER (offset " << LaterOffset + << ", " << EarlierSize << ")\n"); + Value *EarlierWriteLength = EarlierIntrinsic->getLength(); Value *TrimmedLength = ConstantInt::get(EarlierWriteLength->getType(), NewLength); @@ -966,7 +1000,7 @@ static bool removePartiallyOverlappedStores(AliasAnalysis *AA, bool Changed = false; for (auto OI : IOL) { Instruction *EarlierWrite = OI.first; - MemoryLocation Loc = getLocForWrite(EarlierWrite, *AA); + MemoryLocation Loc = getLocForWrite(EarlierWrite); assert(isRemovable(EarlierWrite) && "Expect only removable instruction"); assert(Loc.Size != MemoryLocation::UnknownSize && "Unexpected mem loc"); @@ -1002,8 +1036,9 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, if (SI->getPointerOperand() == DepLoad->getPointerOperand() && isRemovable(SI) && memoryIsNotModifiedBetween(DepLoad, SI, AA)) { - DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: " - << *DepLoad << "\n STORE: " << *SI << '\n'); + LLVM_DEBUG( + dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: " + << *DepLoad << "\n STORE: " << *SI << '\n'); deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, InstrOrdering); ++NumRedundantStores; @@ -1019,7 +1054,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) && memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA)) { - DEBUG( + LLVM_DEBUG( dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: " << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n'); @@ -1067,7 +1102,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, } // Check to see if Inst writes to memory. If not, continue. - if (!hasMemoryWrite(Inst, *TLI)) + if (!hasAnalyzableMemoryWrite(Inst, *TLI)) continue; // eliminateNoopStore will update in iterator, if necessary. 
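The new bail-out in tryToShorten() encodes a simple arithmetic rule for the element-wise atomic intrinsics: every element is transferred by one atomic access, so a shortened length that splits an element cannot be expressed. A self-contained sketch of that check under the same arithmetic as the hunk above (computeTrimmedLength is a hypothetical name, not part of the pass):

    #include <cstdint>

    // Compute the earlier write's trimmed length when its end (IsOverwriteEnd)
    // or its beginning is overwritten, then apply the element-size rule added
    // for the *.element.unordered.atomic intrinsics.
    bool computeTrimmedLength(int64_t EarlierOffset, int64_t EarlierSize,
                              int64_t LaterOffset, bool IsOverwriteEnd,
                              uint32_t ElementSize, int64_t &NewLength) {
      NewLength = IsOverwriteEnd
                      ? LaterOffset - EarlierOffset
                      : EarlierSize - (LaterOffset - EarlierOffset);
      // A plain memset/memcpy may be shortened to any byte count; an atomic
      // element-wise intrinsic must keep a whole number of elements.
      if (ElementSize != 0 && NewLength % ElementSize != 0)
        return false;
      return true;
    }

For example, trimming a 16-byte atomic copy with 4-byte elements to 12 bytes is legal (12 % 4 == 0), while trimming it to 10 bytes is rejected, which is exactly the 0 != NewLength % ElementSize early return above.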
@@ -1085,7 +1120,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, continue; // Figure out what location is being stored to. - MemoryLocation Loc = getLocForWrite(Inst, *AA); + MemoryLocation Loc = getLocForWrite(Inst); // If we didn't get a useful location, fail. if (!Loc.Ptr) @@ -1107,7 +1142,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, // // Find out what memory location the dependent instruction stores. Instruction *DepWrite = InstDep.getInst(); - MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA); + if (!hasAnalyzableMemoryWrite(DepWrite, *TLI)) + break; + MemoryLocation DepLoc = getLocForWrite(DepWrite); // If we didn't get a useful location, or if it isn't a size, bail out. if (!DepLoc.Ptr) break; @@ -1145,12 +1182,12 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, if (isRemovable(DepWrite) && !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { int64_t InstWriteOffset, DepWriteOffset; - OverwriteResult OR = - isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset, - DepWrite, IOL); + OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, + InstWriteOffset, DepWrite, IOL, *AA, + BB.getParent()); if (OR == OW_Complete) { - DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " - << *DepWrite << "\n KILLER: " << *Inst << '\n'); + LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite + << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, &InstrOrdering); @@ -1208,9 +1245,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, // store, shifted appropriately. APInt Merged = (EarlierValue & ~Mask) | (LaterValue << LShiftAmount); - DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite - << "\n Later: " << *Inst - << "\n Merged Value: " << Merged << '\n'); + LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite + << "\n Later: " << *Inst + << "\n Merged Value: " << Merged << '\n'); auto *SI = new StoreInst( ConstantInt::get(Earlier->getValueOperand()->getType(), Merged), diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 5798e1c4ee99..565745d12e99 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -27,6 +27,7 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -49,10 +50,10 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/DebugCounter.h" #include "llvm/Support/RecyclingAllocator.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <deque> #include <memory> @@ -70,13 +71,16 @@ STATISTIC(NumCSELoad, "Number of load instructions CSE'd"); STATISTIC(NumCSECall, "Number of call instructions CSE'd"); STATISTIC(NumDSE, "Number of trivial dead stores removed"); +DEBUG_COUNTER(CSECounter, "early-cse", + "Controls which instructions are removed"); + //===----------------------------------------------------------------------===// // SimpleValue 
//===----------------------------------------------------------------------===// namespace { -/// \brief Struct representing the available values in the scoped hash table. +/// Struct representing the available values in the scoped hash table. struct SimpleValue { Instruction *Inst; @@ -151,12 +155,15 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { SelectPatternFlavor SPF = matchSelectPattern(Inst, A, B).Flavor; // TODO: We should also detect FP min/max. if (SPF == SPF_SMIN || SPF == SPF_SMAX || - SPF == SPF_UMIN || SPF == SPF_UMAX || - SPF == SPF_ABS || SPF == SPF_NABS) { + SPF == SPF_UMIN || SPF == SPF_UMAX) { if (A > B) std::swap(A, B); return hash_combine(Inst->getOpcode(), SPF, A, B); } + if (SPF == SPF_ABS || SPF == SPF_NABS) { + // ABS/NABS always puts the input in A and its negation in B. + return hash_combine(Inst->getOpcode(), SPF, A, B); + } if (CastInst *CI = dyn_cast<CastInst>(Inst)) return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0)); @@ -226,8 +233,13 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { LSPF == SPF_ABS || LSPF == SPF_NABS) { Value *RHSA, *RHSB; SelectPatternFlavor RSPF = matchSelectPattern(RHSI, RHSA, RHSB).Flavor; - return (LSPF == RSPF && ((LHSA == RHSA && LHSB == RHSB) || - (LHSA == RHSB && LHSB == RHSA))); + if (LSPF == RSPF) { + // Abs results are placed in a defined order by matchSelectPattern. + if (LSPF == SPF_ABS || LSPF == SPF_NABS) + return LHSA == RHSA && LHSB == RHSB; + return ((LHSA == RHSA && LHSB == RHSB) || + (LHSA == RHSB && LHSB == RHSA)); + } } return false; @@ -239,7 +251,7 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { namespace { -/// \brief Struct representing the available call values in the scoped hash +/// Struct representing the available call values in the scoped hash /// table. struct CallValue { Instruction *Inst; @@ -305,7 +317,7 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { namespace { -/// \brief A simple and fast domtree-based CSE pass. +/// A simple and fast domtree-based CSE pass. /// /// This pass does a simple depth-first walk over the dominator tree, /// eliminating trivially redundant instructions and using instsimplify to @@ -329,7 +341,7 @@ public: ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>, AllocatorTy>; - /// \brief A scoped hash table of the current values of all of our simple + /// A scoped hash table of the current values of all of our simple /// scalar expressions. /// /// As we walk down the domtree, we look to see if instructions are in this: @@ -337,8 +349,8 @@ public: /// that dominated values can succeed in their lookup. ScopedHTType AvailableValues; - /// A scoped hash table of the current values of previously encounted memory - /// locations. + /// A scoped hash table of the current values of previously encountered + /// memory locations. /// /// This allows us to get efficient access to dominating loads or stores when /// we have a fully redundant load. 
In addition to the most recent load, we @@ -356,13 +368,12 @@ public: unsigned Generation = 0; int MatchingId = -1; bool IsAtomic = false; - bool IsInvariant = false; LoadValue() = default; LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId, - bool IsAtomic, bool IsInvariant) + bool IsAtomic) : DefInst(Inst), Generation(Generation), MatchingId(MatchingId), - IsAtomic(IsAtomic), IsInvariant(IsInvariant) {} + IsAtomic(IsAtomic) {} }; using LoadMapAllocator = @@ -373,8 +384,19 @@ public: LoadMapAllocator>; LoadHTType AvailableLoads; + + // A scoped hash table mapping memory locations (represented as typed + // addresses) to generation numbers at which that memory location became + // (henceforth indefinitely) invariant. + using InvariantMapAllocator = + RecyclingAllocator<BumpPtrAllocator, + ScopedHashTableVal<MemoryLocation, unsigned>>; + using InvariantHTType = + ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>, + InvariantMapAllocator>; + InvariantHTType AvailableInvariants; - /// \brief A scoped hash table of the current values of read-only call + /// A scoped hash table of the current values of read-only call /// values. /// /// It uses the same generation count as loads. @@ -382,10 +404,10 @@ public: ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>; CallHTType AvailableCalls; - /// \brief This is the current generation of the memory value. + /// This is the current generation of the memory value. unsigned CurrentGeneration = 0; - /// \brief Set up the EarlyCSE runner for a particular function. + /// Set up the EarlyCSE runner for a particular function. EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA) @@ -401,15 +423,16 @@ private: class NodeScope { public: NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads, - CallHTType &AvailableCalls) - : Scope(AvailableValues), LoadScope(AvailableLoads), - CallScope(AvailableCalls) {} + InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls) + : Scope(AvailableValues), LoadScope(AvailableLoads), + InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {} NodeScope(const NodeScope &) = delete; NodeScope &operator=(const NodeScope &) = delete; private: ScopedHTType::ScopeTy Scope; LoadHTType::ScopeTy LoadScope; + InvariantHTType::ScopeTy InvariantScope; CallHTType::ScopeTy CallScope; }; @@ -420,10 +443,13 @@ private: class StackNode { public: StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads, - CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n, - DomTreeNode::iterator child, DomTreeNode::iterator end) + InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls, + unsigned cg, DomTreeNode *n, DomTreeNode::iterator child, + DomTreeNode::iterator end) : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child), - EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls) + EndIter(end), + Scopes(AvailableValues, AvailableLoads, AvailableInvariants, + AvailableCalls) {} StackNode(const StackNode &) = delete; StackNode &operator=(const StackNode &) = delete; @@ -455,7 +481,7 @@ private: bool Processed = false; }; - /// \brief Wrapper class to handle memory instructions, including loads, + /// Wrapper class to handle memory instructions, including loads, /// stores and intrinsic loads and stores defined by the target. 
class ParseMemoryInst { public: @@ -532,12 +558,7 @@ private: Value *getPointerOperand() const { if (IsTargetMemInst) return Info.PtrVal; - if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - return LI->getPointerOperand(); - } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - return SI->getPointerOperand(); - } - return nullptr; + return getLoadStorePointerOperand(Inst); } bool mayReadFromMemory() const { @@ -558,6 +579,9 @@ bool processNode(DomTreeNode *Node); + bool handleBranchCondition(Instruction *CondInst, const BranchInst *BI, + const BasicBlock *BB, const BasicBlock *Pred); + Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const { if (auto *LI = dyn_cast<LoadInst>(Inst)) return LI; @@ -568,6 +592,10 @@ ExpectedType); } + /// Return true if the instruction is known to only operate on memory + /// provably invariant in the given "generation". + bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt); + bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration, Instruction *EarlierInst, Instruction *LaterInst); @@ -661,6 +689,79 @@ bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration, return MSSA->dominates(LaterDef, EarlierMA); } +bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) { + // A location loaded from with an invariant_load is assumed to *never* change + // within the visible scope of the compilation. + if (auto *LI = dyn_cast<LoadInst>(I)) + if (LI->getMetadata(LLVMContext::MD_invariant_load)) + return true; + + auto MemLocOpt = MemoryLocation::getOrNone(I); + if (!MemLocOpt) + // "target" intrinsic forms of loads aren't currently known to + // MemoryLocation::get. TODO + return false; + MemoryLocation MemLoc = *MemLocOpt; + if (!AvailableInvariants.count(MemLoc)) + return false; + + // Is the generation at which this became invariant older than the + // current one? + return AvailableInvariants.lookup(MemLoc) <= GenAt; +} + +bool EarlyCSE::handleBranchCondition(Instruction *CondInst, + const BranchInst *BI, const BasicBlock *BB, + const BasicBlock *Pred) { + assert(BI->isConditional() && "Should be a conditional branch!"); + assert(BI->getCondition() == CondInst && "Wrong condition?"); + assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB); + auto *TorF = (BI->getSuccessor(0) == BB) + ? ConstantInt::getTrue(BB->getContext()) + : ConstantInt::getFalse(BB->getContext()); + auto MatchBinOp = [](Instruction *I, unsigned Opcode) { + if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(I)) + return BOp->getOpcode() == Opcode; + return false; + }; + // If the condition is an AND operation, we can propagate its operands into + // the true branch. If it is an OR operation, we can propagate them into the + // false branch. + unsigned PropagateOpcode = + (BI->getSuccessor(0) == BB) ? Instruction::And : Instruction::Or; + + bool MadeChanges = false; + SmallVector<Instruction *, 4> WorkList; + SmallPtrSet<Instruction *, 4> Visited; + WorkList.push_back(CondInst); + while (!WorkList.empty()) { + Instruction *Curr = WorkList.pop_back_val(); + + AvailableValues.insert(Curr, TorF); + LLVM_DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '" + << Curr->getName() << "' as " << *TorF << " in " + << BB->getName() << "\n"); + if (!DebugCounter::shouldExecute(CSECounter)) { + LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); + } else { + // Replace all dominated uses with the known value.
+ if (unsigned Count = replaceDominatedUsesWith(Curr, TorF, DT, + BasicBlockEdge(Pred, BB))) { + NumCSECVP += Count; + MadeChanges = true; + } + } + + if (MatchBinOp(Curr, PropagateOpcode)) + for (auto &Op : cast<BinaryOperator>(Curr)->operands()) + if (Instruction *OPI = dyn_cast<Instruction>(Op)) + if (SimpleValue::canHandle(OPI) && Visited.insert(OPI).second) + WorkList.push_back(OPI); + } + + return MadeChanges; +} + bool EarlyCSE::processNode(DomTreeNode *Node) { bool Changed = false; BasicBlock *BB = Node->getBlock(); @@ -684,22 +785,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { auto *BI = dyn_cast<BranchInst>(Pred->getTerminator()); if (BI && BI->isConditional()) { auto *CondInst = dyn_cast<Instruction>(BI->getCondition()); - if (CondInst && SimpleValue::canHandle(CondInst)) { - assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB); - auto *TorF = (BI->getSuccessor(0) == BB) - ? ConstantInt::getTrue(BB->getContext()) - : ConstantInt::getFalse(BB->getContext()); - AvailableValues.insert(CondInst, TorF); - DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '" - << CondInst->getName() << "' as " << *TorF << " in " - << BB->getName() << "\n"); - // Replace all dominated uses with the known value. - if (unsigned Count = replaceDominatedUsesWith( - CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) { - Changed = true; - NumCSECVP += Count; - } - } + if (CondInst && SimpleValue::canHandle(CondInst)) + Changed |= handleBranchCondition(CondInst, BI, BB, Pred); } } @@ -716,7 +803,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // Dead instructions should just be removed. if (isInstructionTriviallyDead(Inst, &TLI)) { - DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n'); + LLVM_DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n'); + if (!DebugCounter::shouldExecute(CSECounter)) { + LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); + continue; + } + salvageDebugInfo(*Inst); removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; @@ -732,31 +824,44 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { auto *CondI = dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0)); if (CondI && SimpleValue::canHandle(CondI)) { - DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst << '\n'); + LLVM_DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst + << '\n'); AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext())); } else - DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n'); + LLVM_DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n'); continue; } // Skip sideeffect intrinsics, for the same reason as assume intrinsics. if (match(Inst, m_Intrinsic<Intrinsic::sideeffect>())) { - DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n'); + LLVM_DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n'); continue; } - // Skip invariant.start intrinsics since they only read memory, and we can - // forward values across it. Also, we dont need to consume the last store - // since the semantics of invariant.start allow us to perform DSE of the - // last store, if there was a store following invariant.start. Consider: + // We can skip all invariant.start intrinsics since they only read memory, + // and we can forward values across it. For invariant starts without + // invariant ends, we can use the fact that the invariantness never ends to + // start a scope in the current generation which is true for all future + // generations.
Also, we don't need to consume the last store since the + // semantics of invariant.start allow us to perform DSE of the last + // store, if there was a store following invariant.start. Consider: // // store 30, i8* p // invariant.start(p) // store 40, i8* p // We can DSE the store to 30, since the store 40 to invariant location p // causes undefined behaviour. - if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>())) + if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>())) { + // If there are any uses, the scope might end. + if (!Inst->use_empty()) + continue; + auto *CI = cast<CallInst>(Inst); + MemoryLocation MemLoc = MemoryLocation::getForArgument(CI, 1, TLI); + // Don't start a scope if we already have a better one pushed. + if (!AvailableInvariants.count(MemLoc)) + AvailableInvariants.insert(MemLoc, CurrentGeneration); continue; + } if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) { if (auto *CondI = @@ -767,7 +872,8 @@ // Is the condition known to be true? if (isa<ConstantInt>(KnownCond) && cast<ConstantInt>(KnownCond)->isOne()) { - DEBUG(dbgs() << "EarlyCSE removing guard: " << *Inst << '\n'); + LLVM_DEBUG(dbgs() + << "EarlyCSE removing guard: " << *Inst << '\n'); removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; @@ -792,29 +898,39 @@ // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. if (Value *V = SimplifyInstruction(Inst, SQ)) { - DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); - bool Killed = false; - if (!Inst->use_empty()) { - Inst->replaceAllUsesWith(V); - Changed = true; - } - if (isInstructionTriviallyDead(Inst, &TLI)) { - removeMSSA(Inst); - Inst->eraseFromParent(); - Changed = true; - Killed = true; + LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V + << '\n'); + if (!DebugCounter::shouldExecute(CSECounter)) { + LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); + } else { + bool Killed = false; + if (!Inst->use_empty()) { + Inst->replaceAllUsesWith(V); + Changed = true; + } + if (isInstructionTriviallyDead(Inst, &TLI)) { + removeMSSA(Inst); + Inst->eraseFromParent(); + Changed = true; + Killed = true; + } + if (Changed) + ++NumSimplify; + if (Killed) + continue; } - if (Changed) - ++NumSimplify; - if (Killed) - continue; } // If this is a simple instruction that we can value number, process it. if (SimpleValue::canHandle(Inst)) { // See if the instruction has an available value. If so, use it. if (Value *V = AvailableValues.lookup(Inst)) { - DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n'); + LLVM_DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V + << '\n'); + if (!DebugCounter::shouldExecute(CSECounter)) { + LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); + continue; + } if (auto *I = dyn_cast<Instruction>(V)) I->andIRFlags(Inst); Inst->replaceAllUsesWith(V); @@ -840,6 +956,17 @@ ++CurrentGeneration; } + if (MemInst.isInvariantLoad()) { + // If we pass an invariant load, we know that memory location is + // indefinitely constant from the moment of first dereferenceability. + // We conservatively treat the invariant_load as that moment. If we + // pass an invariant load after already establishing a scope, don't + // restart it since we want to preserve the earliest point seen.
+ auto MemLoc = MemoryLocation::get(Inst); + if (!AvailableInvariants.count(MemLoc)) + AvailableInvariants.insert(MemLoc, CurrentGeneration); + } + // If we have an available version of this load, and if it is the right // generation or the load is known to be from an invariant location, // replace this instruction. @@ -854,13 +981,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { !MemInst.isVolatile() && MemInst.isUnordered() && // We can't replace an atomic load with one which isn't also atomic. InVal.IsAtomic >= MemInst.isAtomic() && - (InVal.IsInvariant || MemInst.isInvariantLoad() || + (isOperatingOnInvariantMemAt(Inst, InVal.Generation) || isSameMemGeneration(InVal.Generation, CurrentGeneration, InVal.DefInst, Inst))) { Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType()); if (Op != nullptr) { - DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst - << " to: " << *InVal.DefInst << '\n'); + LLVM_DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst + << " to: " << *InVal.DefInst << '\n'); + if (!DebugCounter::shouldExecute(CSECounter)) { + LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); + continue; + } if (!Inst->use_empty()) Inst->replaceAllUsesWith(Op); removeMSSA(Inst); @@ -875,7 +1006,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { AvailableLoads.insert( MemInst.getPointerOperand(), LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), - MemInst.isAtomic(), MemInst.isInvariantLoad())); + MemInst.isAtomic())); LastStore = nullptr; continue; } @@ -898,8 +1029,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (InVal.first != nullptr && isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first, Inst)) { - DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst - << " to: " << *InVal.first << '\n'); + LLVM_DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst + << " to: " << *InVal.first << '\n'); + if (!DebugCounter::shouldExecute(CSECounter)) { + LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); + continue; + } if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); removeMSSA(Inst); @@ -938,8 +1073,9 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { InVal.MatchingId == MemInst.getMatchingId() && // We don't yet handle removing stores with ordering of any kind. !MemInst.isVolatile() && MemInst.isUnordered() && - isSameMemGeneration(InVal.Generation, CurrentGeneration, - InVal.DefInst, Inst)) { + (isOperatingOnInvariantMemAt(Inst, InVal.Generation) || + isSameMemGeneration(InVal.Generation, CurrentGeneration, + InVal.DefInst, Inst))) { // It is okay to have a LastStore to a different pointer here if MemorySSA // tells us that the load and store are from the same memory generation. 
// In that case, LastStore should keep its present value since we're @@ -949,7 +1085,11 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { MemInst.getPointerOperand() || MSSA) && "can't have an intervening store if not using MemorySSA!"); - DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n'); + LLVM_DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n'); + if (!DebugCounter::shouldExecute(CSECounter)) { + LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); + continue; + } removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; @@ -980,13 +1120,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { !LastStoreMemInst.isVolatile() && "Violated invariant"); if (LastStoreMemInst.isMatchingMemLoc(MemInst)) { - DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore - << " due to: " << *Inst << '\n'); - removeMSSA(LastStore); - LastStore->eraseFromParent(); - Changed = true; - ++NumDSE; - LastStore = nullptr; + LLVM_DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore + << " due to: " << *Inst << '\n'); + if (!DebugCounter::shouldExecute(CSECounter)) { + LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); + } else { + removeMSSA(LastStore); + LastStore->eraseFromParent(); + Changed = true; + ++NumDSE; + LastStore = nullptr; + } } // fallthrough - we can exploit information about this store } @@ -999,7 +1143,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { AvailableLoads.insert( MemInst.getPointerOperand(), LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), - MemInst.isAtomic(), /*IsInvariant=*/false)); + MemInst.isAtomic())); // Remember that this was the last unordered store we saw for DSE. We // don't yet handle DSE on ordered or volatile stores since we don't @@ -1031,8 +1175,9 @@ bool EarlyCSE::run() { // Process the root node. nodesToProcess.push_back(new StackNode( - AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration, - DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end())); + AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls, + CurrentGeneration, DT.getRootNode(), + DT.getRootNode()->begin(), DT.getRootNode()->end())); // Save the current generation. unsigned LiveOutGeneration = CurrentGeneration; @@ -1056,9 +1201,9 @@ bool EarlyCSE::run() { // Push the next child onto the stack. DomTreeNode *child = NodeToProcess->nextChild(); nodesToProcess.push_back( - new StackNode(AvailableValues, AvailableLoads, AvailableCalls, - NodeToProcess->childGeneration(), child, child->begin(), - child->end())); + new StackNode(AvailableValues, AvailableLoads, AvailableInvariants, + AvailableCalls, NodeToProcess->childGeneration(), + child, child->begin(), child->end())); } else { // It has been processed, and there are no more children to process, // so delete it and pop it off the stack. @@ -1097,7 +1242,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F, namespace { -/// \brief A simple and fast domtree-based CSE pass. +/// A simple and fast domtree-based CSE pass. 
/// /// This pass does a simple depth-first walk over the dominator tree, /// eliminating trivially redundant instructions and using instsimplify to diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index 063df779a30b..117b19fb8a42 100644 --- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -12,10 +12,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/CFG.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "flattencfg" diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp index b105ece8dc7c..f2828e80bc58 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -138,7 +138,7 @@ void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { // Helper - mark I as having been traversed, having range R. void Float2IntPass::seen(Instruction *I, ConstantRange R) { - DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n"); + LLVM_DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n"); auto IT = SeenInsts.find(I); if (IT != SeenInsts.end()) IT->second = std::move(R); @@ -359,7 +359,7 @@ bool Float2IntPass::validateAndTransform() { for (User *U : I->users()) { Instruction *UI = dyn_cast<Instruction>(U); if (!UI || SeenInsts.find(UI) == SeenInsts.end()) { - DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n"); + LLVM_DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n"); Fail = true; break; } @@ -380,7 +380,7 @@ bool Float2IntPass::validateAndTransform() { // lower limits, plus one so it can be signed. unsigned MinBW = std::max(R.getLower().getMinSignedBits(), R.getUpper().getMinSignedBits()) + 1; - DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n"); + LLVM_DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n"); // If we've run off the realms of the exactly representable integers, // the floating point result will differ from an integer approximation. @@ -391,11 +391,12 @@ bool Float2IntPass::validateAndTransform() { unsigned MaxRepresentableBits = APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1; if (MinBW > MaxRepresentableBits) { - DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n"); + LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n"); continue; } if (MinBW > 64) { - DEBUG(dbgs() << "F2I: Value requires more than 64 bits to represent!\n"); + LLVM_DEBUG( + dbgs() << "F2I: Value requires more than 64 bits to represent!\n"); continue; } @@ -490,7 +491,7 @@ void Float2IntPass::cleanup() { } bool Float2IntPass::runImpl(Function &F) { - DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n"); + LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n"); // Clear out all state. 
ECs = EquivalenceClasses<Instruction*>(); SeenInsts.clear(); diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index e2c1eaf58e43..1e0a22cb14b3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -38,7 +38,9 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Config/llvm-config.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" @@ -69,7 +71,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/VNCoercion.h" #include <algorithm> @@ -765,6 +766,15 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, if (SSAUpdate.HasValueForBlock(BB)) continue; + // If the value is the load that we will be eliminating, and the block it's + // available in is the block that the load is in, then don't add it as + // SSAUpdater will resolve the value to the relevant phi which may let it + // avoid phi construction entirely if there's actually only one value. + if (BB == LI->getParent() && + ((AV.AV.isSimpleValue() && AV.AV.getSimpleValue() == LI) || + (AV.AV.isCoercedLoadValue() && AV.AV.getCoercedLoadValue() == LI))) + continue; + SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn)); } @@ -783,9 +793,10 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI, if (Res->getType() != LoadTy) { Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL); - DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " - << *getSimpleValue() << '\n' - << *Res << '\n' << "\n\n\n"); + LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset + << " " << *getSimpleValue() << '\n' + << *Res << '\n' + << "\n\n\n"); } } else if (isCoercedLoadValue()) { LoadInst *Load = getCoercedLoadValue(); @@ -799,20 +810,21 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI, // but then there all of the operations based on it would need to be // rehashed. Just leave the dead load around. 
gvn.getMemDep().removeInstruction(Load); - DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " - << *getCoercedLoadValue() << '\n' - << *Res << '\n' - << "\n\n\n"); + LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset + << " " << *getCoercedLoadValue() << '\n' + << *Res << '\n' + << "\n\n\n"); } } else if (isMemIntrinValue()) { Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy, InsertPt, DL); - DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset - << " " << *getMemIntrinValue() << '\n' - << *Res << '\n' << "\n\n\n"); + LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset + << " " << *getMemIntrinValue() << '\n' + << *Res << '\n' + << "\n\n\n"); } else { assert(isUndefValue() && "Should be UndefVal"); - DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";); + LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";); return UndefValue::get(LoadTy); } assert(Res && "failed to materialize?"); @@ -825,7 +837,7 @@ static bool isLifetimeStart(const Instruction *Inst) { return false; } -/// \brief Try to locate the three instruction involved in a missed +/// Try to locate the three instructions involved in a missed /// load-elimination case that is due to an intervening store. static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo, DominatorTree *DT, @@ -914,13 +926,11 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, } } // Nothing known about this clobber, have to be conservative - DEBUG( - // fast print dep, using operator<< on instruction is too slow. - dbgs() << "GVN: load "; - LI->printAsOperand(dbgs()); - Instruction *I = DepInfo.getInst(); - dbgs() << " is clobbered by " << *I << '\n'; - ); + LLVM_DEBUG( + // fast print dep, using operator<< on instruction is too slow. + dbgs() << "GVN: load "; LI->printAsOperand(dbgs()); + Instruction *I = DepInfo.getInst(); + dbgs() << " is clobbered by " << *I << '\n';); if (ORE->allowExtraAnalysis(DEBUG_TYPE)) reportMayClobberedLoad(LI, DepInfo, DT, ORE); @@ -978,12 +988,10 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, } // Unknown def - must be conservative - DEBUG( - // fast print dep, using operator<< on instruction is too slow. - dbgs() << "GVN: load "; - LI->printAsOperand(dbgs()); - dbgs() << " has unknown def " << *DepInst << '\n'; - ); + LLVM_DEBUG( + // fast print dep, using operator<< on instruction is too slow. + dbgs() << "GVN: load "; LI->printAsOperand(dbgs()); + dbgs() << " has unknown def " << *DepInst << '\n';); return false; } @@ -1065,7 +1073,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // It is illegal to move the array access to any point above the guard, // because if the index is out of bounds we should deoptimize rather than // access the array. - // Check that there is no guard in this block above our intruction. + // Check that there is no guard in this block above our instruction. if (!IsSafeToSpeculativelyExecute) { auto It = FirstImplicitControlFlowInsts.find(TmpBB); if (It != FirstImplicitControlFlowInsts.end()) { @@ -1113,9 +1121,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // If any predecessor block is an EH pad that does not allow non-PHI // instructions before the terminator, we can't PRE the load.
if (Pred->getTerminator()->isEHPad()) { - DEBUG(dbgs() - << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '" - << Pred->getName() << "': " << *LI << '\n'); + LLVM_DEBUG( + dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '" + << Pred->getName() << "': " << *LI << '\n'); return false; } @@ -1125,15 +1133,16 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (Pred->getTerminator()->getNumSuccessors() != 1) { if (isa<IndirectBrInst>(Pred->getTerminator())) { - DEBUG(dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '" - << Pred->getName() << "': " << *LI << '\n'); + LLVM_DEBUG( + dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '" + << Pred->getName() << "': " << *LI << '\n'); return false; } if (LoadBB->isEHPad()) { - DEBUG(dbgs() - << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '" - << Pred->getName() << "': " << *LI << '\n'); + LLVM_DEBUG( + dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '" + << Pred->getName() << "': " << *LI << '\n'); return false; } @@ -1161,8 +1170,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB); assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!"); PredLoads[NewPred] = nullptr; - DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->" - << LoadBB->getName() << '\n'); + LLVM_DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->" + << LoadBB->getName() << '\n'); } // Check if the load can safely be moved to all the unavailable predecessors. @@ -1186,8 +1195,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // If we couldn't find or insert a computation of this phi translated value, // we fail PRE. if (!LoadPtr) { - DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: " - << *LI->getPointerOperand() << "\n"); + LLVM_DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: " + << *LI->getPointerOperand() << "\n"); CanDoPRE = false; break; } @@ -1208,10 +1217,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Okay, we can eliminate this load by inserting a reload in the predecessor // and using PHI construction to get the value in the other predecessors, do // it. - DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n'); - DEBUG(if (!NewInsts.empty()) - dbgs() << "INSERTED " << NewInsts.size() << " INSTS: " - << *NewInsts.back() << '\n'); + LLVM_DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n'); + LLVM_DEBUG(if (!NewInsts.empty()) dbgs() + << "INSERTED " << NewInsts.size() << " INSTS: " << *NewInsts.back() + << '\n'); // Assign value numbers to the new instructions. for (Instruction *I : NewInsts) { @@ -1262,7 +1271,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred, NewLoad)); MD->invalidateCachedPointerInfo(LoadPtr); - DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n'); + LLVM_DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n'); } // Perform PHI construction. @@ -1320,11 +1329,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // clobber in the current block. Reject this early. 
if (NumDeps == 1 && !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { - DEBUG( - dbgs() << "GVN: non-local load "; - LI->printAsOperand(dbgs()); - dbgs() << " has unknown dependencies\n"; - ); + LLVM_DEBUG(dbgs() << "GVN: non-local load "; LI->printAsOperand(dbgs()); + dbgs() << " has unknown dependencies\n";); return false; } @@ -1353,7 +1359,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // load, then it is fully redundant and we can use PHI insertion to compute // its value. Insert PHIs and remove the fully redundant value now. if (UnavailableBlocks.empty()) { - DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); + LLVM_DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); // Perform PHI construction. Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); @@ -1506,12 +1512,10 @@ bool GVN::processLoad(LoadInst *L) { // Only handle the local case below if (!Dep.isDef() && !Dep.isClobber()) { // This might be a NonFuncLocal or an Unknown - DEBUG( - // fast print dep, using operator<< on instruction is too slow. - dbgs() << "GVN: load "; - L->printAsOperand(dbgs()); - dbgs() << " has unknown dependence\n"; - ); + LLVM_DEBUG( + // fast print dep, using operator<< on instruction is too slow. + dbgs() << "GVN: load "; L->printAsOperand(dbgs()); + dbgs() << " has unknown dependence\n";); return false; } @@ -1695,8 +1699,8 @@ bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { if (it != ReplaceWithConstMap.end()) { assert(!isa<Constant>(Operand) && "Replacing constants with constants is invalid"); - DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second - << " in instruction " << *Instr << '\n'); + LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " + << *it->second << " in instruction " << *Instr << '\n'); Instr->setOperand(OpNum, it->second); Changed = true; } @@ -2038,7 +2042,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, unsigned Iteration = 0; while (ShouldContinue) { - DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n"); + LLVM_DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n"); ShouldContinue = iterateOnFunction(F); Changed |= ShouldContinue; ++Iteration; @@ -2104,9 +2108,10 @@ bool GVN::processBlock(BasicBlock *BB) { const Instruction *MaybeFirstICF = FirstImplicitControlFlowInsts.lookup(BB); for (auto *I : InstrsToErase) { assert(I->getParent() == BB && "Removing instruction from wrong block?"); - DEBUG(dbgs() << "GVN removed: " << *I << '\n'); + LLVM_DEBUG(dbgs() << "GVN removed: " << *I << '\n'); + salvageDebugInfo(*I); if (MD) MD->removeInstruction(I); - DEBUG(verifyRemoved(I)); + LLVM_DEBUG(verifyRemoved(I)); if (MaybeFirstICF == I) { // We have erased the first ICF in block. The map needs to be updated. InvalidateImplicitCF = true; @@ -2288,7 +2293,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { PREInstr = CurInst->clone(); if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) { // If we failed insertion, make sure we remove the instruction. 
- DEBUG(verifyRemoved(PREInstr)); + LLVM_DEBUG(verifyRemoved(PREInstr)); PREInstr->deleteValue(); return false; } @@ -2326,10 +2331,10 @@ bool GVN::performScalarPRE(Instruction *CurInst) { VN.erase(CurInst); removeFromLeaderTable(ValNo, CurInst, CurrentBlock); - DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); + LLVM_DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); if (MD) MD->removeInstruction(CurInst); - DEBUG(verifyRemoved(CurInst)); + LLVM_DEBUG(verifyRemoved(CurInst)); bool InvalidateImplicitCF = FirstImplicitControlFlowInsts.lookup(CurInst->getParent()) == CurInst; // FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp index 026fab5dbd3b..6d2b25cf6013 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -48,6 +48,7 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/PostDominators.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -72,7 +73,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" -#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <iterator> @@ -534,7 +534,7 @@ private: if (NewBB == DBB && !MSSA->isLiveOnEntryDef(D)) if (auto *UD = dyn_cast<MemoryUseOrDef>(D)) - if (firstInBB(NewPt, UD->getMemoryInst())) + if (!firstInBB(UD->getMemoryInst(), NewPt)) // Cannot move the load or store to NewPt above its definition in D. return false; @@ -570,7 +570,7 @@ private: // The ides is inspired from: // "Partial Redundancy Elimination in SSA Form" // ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW - // They use similar idea in the forward graph to to find fully redundant and + // They use similar idea in the forward graph to find fully redundant and // partially redundant expressions, here it is used in the inverse graph to // find fully anticipable instructions at merge point (post-dominator in // the inverse CFG). @@ -578,7 +578,7 @@ private: // Returns true when the values are flowing out to each edge. bool valueAnticipable(CHIArgs C, TerminatorInst *TI) const { - if (TI->getNumSuccessors() > (unsigned)std::distance(C.begin(), C.end())) + if (TI->getNumSuccessors() > (unsigned)size(C)) return false; // Not enough args in this CHI. for (auto CHI : C) { @@ -622,7 +622,7 @@ private: // Iterate in reverse order to keep lower ranked values on the top. for (std::pair<VNType, Instruction *> &VI : reverse(it1->second)) { // Get the value of instruction I - DEBUG(dbgs() << "\nPushing on stack: " << *VI.second); + LLVM_DEBUG(dbgs() << "\nPushing on stack: " << *VI.second); RenameStack[VI.first].push_back(VI.second); } } @@ -636,7 +636,7 @@ private: if (P == CHIBBs.end()) { continue; } - DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName();); + LLVM_DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName();); // A CHI is found (BB -> Pred is an edge in the CFG) // Pop the stack until Top(V) = Ve. 
auto &VCHI = P->second; @@ -651,9 +651,9 @@ private: DT->properlyDominates(Pred, si->second.back()->getParent())) { C.Dest = BB; // Assign the edge C.I = si->second.pop_back_val(); // Assign the argument - DEBUG(dbgs() << "\nCHI Inserted in BB: " << C.Dest->getName() - << *C.I << ", VN: " << C.VN.first << ", " - << C.VN.second); + LLVM_DEBUG(dbgs() + << "\nCHI Inserted in BB: " << C.Dest->getName() << *C.I + << ", VN: " << C.VN.first << ", " << C.VN.second); } // Move to next CHI of a different value It = std::find_if(It, VCHI.end(), @@ -748,11 +748,11 @@ private: // TODO: Remove fully-redundant expressions. // Get instruction from the Map, assume that all the Instructions // with same VNs have same rank (this is an approximation). - std::sort(Ranks.begin(), Ranks.end(), - [this, &Map](const VNType &r1, const VNType &r2) { - return (rank(*Map.lookup(r1).begin()) < - rank(*Map.lookup(r2).begin())); - }); + llvm::sort(Ranks.begin(), Ranks.end(), + [this, &Map](const VNType &r1, const VNType &r2) { + return (rank(*Map.lookup(r1).begin()) < + rank(*Map.lookup(r2).begin())); + }); // - Sort VNs according to their rank, and start with lowest ranked VN // - Take a VN and for each instruction with same VN @@ -798,8 +798,8 @@ private: // Ignore spurious PDFs. if (DT->properlyDominates(IDFB, V[i]->getParent())) { OutValue[IDFB].push_back(C); - DEBUG(dbgs() << "\nInsertion a CHI for BB: " << IDFB->getName() - << ", for Insn: " << *V[i]); + LLVM_DEBUG(dbgs() << "\nInsertion a CHI for BB: " << IDFB->getName() + << ", for Insn: " << *V[i]); } } } @@ -1200,6 +1200,7 @@ INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist", INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist", "Early GVN Hoisting of Expressions", false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp index 5594c29bbd9f..28c5940db1e0 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -48,6 +48,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -71,7 +72,6 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/GVNExpression.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -239,7 +239,7 @@ public: SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops; for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)}); - std::sort(Ops.begin(), Ops.end()); + llvm::sort(Ops.begin(), Ops.end()); for (auto &P : Ops) { Blocks.push_back(P.first); Values.push_back(P.second); @@ -361,7 +361,7 @@ public: for (auto &U : I->uses()) op_push_back(U.getUser()); - std::sort(op_begin(), op_end()); + llvm::sort(op_begin(), op_end()); } void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; } @@ -561,7 +561,8 @@ public: GVNSink() = default; bool run(Function &F) { - DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n"); + LLVM_DEBUG(dbgs() << "GVNSink: running on 
function @" << F.getName() + << "\n"); unsigned NumSunk = 0; ReversePostOrderTraversal<Function*> RPOT(&F); @@ -629,15 +630,15 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking( LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum, ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) { auto Insts = *LRI; - DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I - : Insts) { + LLVM_DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I + : Insts) { I->dump(); } dbgs() << " ]\n";); DenseMap<uint32_t, unsigned> VNums; for (auto *I : Insts) { uint32_t N = VN.lookupOrAdd(I); - DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n"); + LLVM_DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n"); if (N == ~0U) return None; VNums[N]++; @@ -749,8 +750,8 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking( } unsigned GVNSink::sinkBB(BasicBlock *BBEnd) { - DEBUG(dbgs() << "GVNSink: running on basic block "; - BBEnd->printAsOperand(dbgs()); dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "GVNSink: running on basic block "; + BBEnd->printAsOperand(dbgs()); dbgs() << "\n"); SmallVector<BasicBlock *, 4> Preds; for (auto *B : predecessors(BBEnd)) { auto *T = B->getTerminator(); @@ -761,7 +762,7 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) { } if (Preds.size() < 2) return 0; - std::sort(Preds.begin(), Preds.end()); + llvm::sort(Preds.begin(), Preds.end()); unsigned NumOrigPreds = Preds.size(); // We can only sink instructions through unconditional branches. @@ -794,23 +795,23 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) { Candidates.begin(), Candidates.end(), [](const SinkingInstructionCandidate &A, const SinkingInstructionCandidate &B) { return A > B; }); - DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C - : Candidates) dbgs() - << " " << C << "\n";); + LLVM_DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C + : Candidates) dbgs() + << " " << C << "\n";); // Pick the top candidate, as long it is positive! if (Candidates.empty() || Candidates.front().Cost <= 0) return 0; auto C = Candidates.front(); - DEBUG(dbgs() << " -- Sinking: " << C << "\n"); + LLVM_DEBUG(dbgs() << " -- Sinking: " << C << "\n"); BasicBlock *InsertBB = BBEnd; if (C.Blocks.size() < NumOrigPreds) { - DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs()); - dbgs() << "\n"); + LLVM_DEBUG(dbgs() << " -- Splitting edge to "; + BBEnd->printAsOperand(dbgs()); dbgs() << "\n"); InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split"); if (!InsertBB) { - DEBUG(dbgs() << " -- FAILED to split edge!\n"); + LLVM_DEBUG(dbgs() << " -- FAILED to split edge!\n"); // Edge couldn't be split. 
return 0; } diff --git a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp index c4aeccb85ca7..ad1598d7b8bf 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -40,9 +40,11 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/GuardWidening.h" +#include <functional> #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/ConstantRange.h" @@ -53,6 +55,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -62,9 +65,14 @@ namespace { class GuardWideningImpl { DominatorTree &DT; - PostDominatorTree &PDT; + PostDominatorTree *PDT; LoopInfo &LI; + /// Together, these describe the region of interest. This might be all of + /// the blocks within a function, or only a given loop's blocks and preheader. + DomTreeNode *Root; + std::function<bool(BasicBlock*)> BlockFilter; + /// The set of guards whose conditions have been widened into dominating /// guards. SmallVector<IntrinsicInst *, 16> EliminatedGuards; @@ -205,39 +213,15 @@ class GuardWideningImpl { } public: - explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree &PDT, - LoopInfo &LI) - : DT(DT), PDT(PDT), LI(LI) {} + + explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT, + LoopInfo &LI, DomTreeNode *Root, + std::function<bool(BasicBlock*)> BlockFilter) + : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter) {} /// The entry point for this pass. bool run(); }; - -struct GuardWideningLegacyPass : public FunctionPass { - static char ID; - GuardWideningPass Impl; - - GuardWideningLegacyPass() : FunctionPass(ID) { - initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override { - if (skipFunction(F)) - return false; - return GuardWideningImpl( - getAnalysis<DominatorTreeWrapperPass>().getDomTree(), - getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(), - getAnalysis<LoopInfoWrapperPass>().getLoopInfo()).run(); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - } -}; - } bool GuardWideningImpl::run() { @@ -246,9 +230,12 @@ bool GuardWideningImpl::run() { DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> GuardsInBlock; bool Changed = false; - for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode()); + for (auto DFI = df_begin(Root), DFE = df_end(Root); DFI != DFE; ++DFI) { auto *BB = (*DFI)->getBlock(); + if (!BlockFilter(BB)) + continue; + auto &CurrentList = GuardsInBlock[BB]; for (auto &I : *BB) @@ -259,6 +246,7 @@ bool GuardWideningImpl::run() { Changed |= eliminateGuardViaWidening(II, DFI, GuardsInBlock); } + assert(EliminatedGuards.empty() || Changed); for (auto *II : EliminatedGuards) if (!WidenedGuards.count(II)) II->eraseFromParent(); @@ -278,6 +266,8 @@ bool GuardWideningImpl::eliminateGuardViaWidening( // for the most profit. 
for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) { auto *CurBB = DFSI.getPath(i)->getBlock(); + if (!BlockFilter(CurBB)) + break; auto *CurLoop = LI.getLoopFor(CurBB); assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!"); const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second; @@ -312,9 +302,9 @@ bool GuardWideningImpl::eliminateGuardViaWidening( for (auto *Candidate : make_range(I, E)) { auto Score = computeWideningScore(GuardInst, GuardInstLoop, Candidate, CurLoop); - DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0) - << " and " << *Candidate->getArgOperand(0) << " is " - << scoreTypeToString(Score) << "\n"); + LLVM_DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0) + << " and " << *Candidate->getArgOperand(0) << " is " + << scoreTypeToString(Score) << "\n"); if (Score > BestScoreSoFar) { BestScoreSoFar = Score; BestSoFar = Candidate; @@ -323,15 +313,16 @@ bool GuardWideningImpl::eliminateGuardViaWidening( } if (BestScoreSoFar == WS_IllegalOrNegative) { - DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n"); + LLVM_DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n"); return false; } assert(BestSoFar != GuardInst && "Should have never visited same guard!"); assert(DT.dominates(BestSoFar, GuardInst) && "Should be!"); - DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar - << " with score " << scoreTypeToString(BestScoreSoFar) << "\n"); + LLVM_DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar + << " with score " << scoreTypeToString(BestScoreSoFar) + << "\n"); widenGuard(BestSoFar, GuardInst->getArgOperand(0)); GuardInst->setArgOperand(0, ConstantInt::getTrue(GuardInst->getContext())); EliminatedGuards.push_back(GuardInst); @@ -345,6 +336,8 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore( bool HoistingOutOfLoop = false; if (DominatingGuardLoop != DominatedGuardLoop) { + // Be conservative and don't widen into a sibling loop. TODO: If the + // sibling is colder, we should consider allowing this. if (DominatingGuardLoop && !DominatingGuardLoop->contains(DominatedGuardLoop)) return WS_IllegalOrNegative; @@ -355,9 +348,14 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore( if (!isAvailableAt(DominatedGuard->getArgOperand(0), DominatingGuard)) return WS_IllegalOrNegative; - bool HoistingOutOfIf = - !PDT.dominates(DominatedGuard->getParent(), DominatingGuard->getParent()); - + // If the guard was conditional executed, it may never be reached + // dynamically. There are two potential downsides to hoisting it out of the + // conditionally executed region: 1) we may spuriously deopt without need and + // 2) we have the extra cost of computing the guard condition in the common + // case. At the moment, we really only consider the second in our heuristic + // here. TODO: evaluate cost model for spurious deopt + // NOTE: As written, this also lets us hoist right over another guard which + // is essentially just another spelling for control flow. if (isWideningCondProfitable(DominatedGuard->getArgOperand(0), DominatingGuard->getArgOperand(0))) return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive; @@ -365,7 +363,26 @@ GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore( if (HoistingOutOfLoop) return WS_Positive; - return HoistingOutOfIf ? WS_IllegalOrNegative : WS_Neutral; + // Returns true if we might be hoisting above explicit control flow. 
Note + // that this completely ignores implicit control flow (guards, calls which + // throw, etc...). That choice appears arbitrary. + auto MaybeHoistingOutOfIf = [&]() { + auto *DominatingBlock = DominatingGuard->getParent(); + auto *DominatedBlock = DominatedGuard->getParent(); + + // Same Block? + if (DominatedBlock == DominatingBlock) + return false; + // Obvious successor (common loop header/preheader case) + if (DominatedBlock == DominatingBlock->getUniqueSuccessor()) + return false; + // TODO: diamond, triangle cases + if (!PDT) return true; + return !PDT->dominates(DominatedGuard->getParent(), + DominatingGuard->getParent()); + }; + + return MaybeHoistingOutOfIf() ? WS_IllegalOrNegative : WS_Neutral; } bool GuardWideningImpl::isAvailableAt(Value *V, Instruction *Loc, @@ -581,9 +598,9 @@ bool GuardWideningImpl::combineRangeChecks( // CurrentChecks.size() will typically be 3 here, but so far there has been // no need to hard-code that fact. - std::sort(CurrentChecks.begin(), CurrentChecks.end(), - [&](const GuardWideningImpl::RangeCheck &LHS, - const GuardWideningImpl::RangeCheck &RHS) { + llvm::sort(CurrentChecks.begin(), CurrentChecks.end(), + [&](const GuardWideningImpl::RangeCheck &LHS, + const GuardWideningImpl::RangeCheck &RHS) { return LHS.getOffsetValue().slt(RHS.getOffsetValue()); }); @@ -651,19 +668,6 @@ bool GuardWideningImpl::combineRangeChecks( return RangeChecksOut.size() != OldCount; } -PreservedAnalyses GuardWideningPass::run(Function &F, - FunctionAnalysisManager &AM) { - auto &DT = AM.getResult<DominatorTreeAnalysis>(F); - auto &LI = AM.getResult<LoopAnalysis>(F); - auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); - if (!GuardWideningImpl(DT, PDT, LI).run()) - return PreservedAnalyses::all(); - - PreservedAnalyses PA; - PA.preserveSet<CFGAnalyses>(); - return PA; -} - #ifndef NDEBUG StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) { switch (WS) { @@ -681,7 +685,82 @@ StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) { } #endif +PreservedAnalyses GuardWideningPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); + if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(), + [](BasicBlock*) { return true; } ).run()) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; +} + +namespace { +struct GuardWideningLegacyPass : public FunctionPass { + static char ID; + + GuardWideningLegacyPass() : FunctionPass(ID) { + initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); + return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(), + [](BasicBlock*) { return true; } ).run(); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + } +}; + +/// Same as above, but restricted to a single loop at a time. 
Can be +/// scheduled with other loop passes w/o breaking out of LPM +struct LoopGuardWideningLegacyPass : public LoopPass { + static char ID; + + LoopGuardWideningLegacyPass() : LoopPass(ID) { + initializeLoopGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipLoop(L)) + return false; + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>(); + auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr; + BasicBlock *RootBB = L->getLoopPredecessor(); + if (!RootBB) + RootBB = L->getHeader(); + auto BlockFilter = [&](BasicBlock *BB) { + return BB == RootBB || L->contains(BB); + }; + return GuardWideningImpl(DT, PDT, LI, + DT.getNode(RootBB), BlockFilter).run(); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + getLoopAnalysisUsage(AU); + AU.addPreserved<PostDominatorTreeWrapperPass>(); + } +}; +} + char GuardWideningLegacyPass::ID = 0; +char LoopGuardWideningLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards", false, false) @@ -691,6 +770,20 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards", false, false) +INITIALIZE_PASS_BEGIN(LoopGuardWideningLegacyPass, "loop-guard-widening", + "Widen guards (within a single loop, as a loop pass)", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(LoopGuardWideningLegacyPass, "loop-guard-widening", + "Widen guards (within a single loop, as a loop pass)", + false, false) + FunctionPass *llvm::createGuardWideningPass() { return new GuardWideningLegacyPass(); } + +Pass *llvm::createLoopGuardWideningPass() { + return new LoopGuardWideningLegacyPass(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 221fe57581ca..8656e88b79cb 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -43,6 +43,7 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" @@ -77,7 +78,6 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include <cassert> @@ -210,8 +210,8 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { if (FromBase == ToBase) return true; - DEBUG(dbgs() << "INDVARS: GEP rewrite bail out " - << *FromBase << " != " << *ToBase << "\n"); + LLVM_DEBUG(dbgs() << "INDVARS: GEP rewrite bail out " << *FromBase + << " != " << *ToBase << "\n"); return false; } @@ -653,8 +653,9 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { Value *ExitVal = expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType()); - DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n' - << " 
LoopVal = " << *Inst << "\n"); + LLVM_DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal + << '\n' + << " LoopVal = " << *Inst << "\n"); if (!isValidRewrite(Inst, ExitVal)) { DeadInsts.push_back(ExitVal); @@ -1084,7 +1085,7 @@ Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) { Instruction *NarrowDef = DU.NarrowDef; Instruction *WideDef = DU.WideDef; - DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n"); + LLVM_DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n"); // Replace NarrowDef operands with WideDef. Otherwise, we don't know anything // about the narrow operand yet so must insert a [sz]ext. It is probably loop @@ -1115,7 +1116,7 @@ Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU, Instruction *NarrowDef = DU.NarrowDef; Instruction *WideDef = DU.WideDef; - DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n"); + LLVM_DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n"); unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1; @@ -1315,8 +1316,8 @@ WidenIV::WidenedRecTy WidenIV::getWideRecurrence(NarrowIVDefUse DU) { /// This IV user cannot be widen. Replace this use of the original narrow IV /// with a truncation of the new wide IV to isolate and eliminate the narrow IV. static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) { - DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef - << " for user " << *DU.NarrowUse << "\n"); + LLVM_DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user " + << *DU.NarrowUse << "\n"); IRBuilder<> Builder( getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); @@ -1396,8 +1397,8 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType()); UsePhi->replaceAllUsesWith(Trunc); DeadInsts.emplace_back(UsePhi); - DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi - << " to " << *WidePhi << "\n"); + LLVM_DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi << " to " + << *WidePhi << "\n"); } return nullptr; } @@ -1428,15 +1429,16 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // A wider extend was hidden behind a narrower one. This may induce // another round of IV widening in which the intermediate IV becomes // dead. It should be very rare. - DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi - << " not wide enough to subsume " << *DU.NarrowUse << "\n"); + LLVM_DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi + << " not wide enough to subsume " << *DU.NarrowUse + << "\n"); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef); NewDef = DU.NarrowUse; } } if (NewDef != DU.NarrowUse) { - DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse - << " replaced by " << *DU.WideDef << "\n"); + LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse + << " replaced by " << *DU.WideDef << "\n"); ++NumElimExt; DU.NarrowUse->replaceAllUsesWith(NewDef); DeadInsts.emplace_back(DU.NarrowUse); @@ -1491,8 +1493,9 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // absolutely guarantee it. Hence the following failsafe check. In rare cases // where it fails, we simply throw away the newly created wide use. 
if (WideAddRec.first != SE->getSCEV(WideUse)) { - DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse - << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec.first << "\n"); + LLVM_DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse << ": " + << *SE->getSCEV(WideUse) << " != " << *WideAddRec.first + << "\n"); DeadInsts.emplace_back(WideUse); return nullptr; } @@ -1597,7 +1600,7 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { WideInc->setDebugLoc(OrigInc->getDebugLoc()); } - DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n"); + LLVM_DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n"); ++NumWidened; // Traverse the def-use chain using a worklist starting at the original IV. @@ -2231,12 +2234,12 @@ linearFunctionTestReplace(Loop *L, else P = ICmpInst::ICMP_EQ; - DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n" - << " LHS:" << *CmpIndVar << '\n' - << " op:\t" - << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n" - << " RHS:\t" << *ExitCnt << "\n" - << " IVCount:\t" << *IVCount << "\n"); + LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n" + << " LHS:" << *CmpIndVar << '\n' + << " op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==") + << "\n" + << " RHS:\t" << *ExitCnt << "\n" + << " IVCount:\t" << *IVCount << "\n"); IRBuilder<> Builder(BI); @@ -2272,7 +2275,7 @@ linearFunctionTestReplace(Loop *L, NewLimit = Start + Count; ExitCnt = ConstantInt::get(CmpIndVar->getType(), NewLimit); - DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n"); + LLVM_DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n"); } else { // We try to extend trip count first. If that doesn't work we truncate IV. // Zext(trunc(IV)) == IV implies equivalence of the following two: diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index cf98088111be..e2f29705f2dd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -43,6 +43,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/None.h" @@ -52,6 +53,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -179,10 +181,7 @@ public: OS << " Step: "; Step->print(OS); OS << " End: "; - if (End) - End->print(OS); - else - OS << "(null)"; + End->print(OS); OS << "\n CheckUse: "; getCheckUse()->getUser()->print(OS); OS << " Operand: " << getCheckUse()->getOperandNo() << "\n"; @@ -196,7 +195,7 @@ public: Use *getCheckUse() const { return CheckUse; } /// Represents an signed integer range [Range.getBegin(), Range.getEnd()). If - /// R.getEnd() sle R.getBegin(), then R denotes the empty range. + /// R.getEnd() le R.getBegin(), then R denotes the empty range. class Range { const SCEV *Begin; @@ -238,17 +237,31 @@ public: /// checks, and hence don't end up in \p Checks. 
static void extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE, - BranchProbabilityInfo &BPI, + BranchProbabilityInfo *BPI, SmallVectorImpl<InductiveRangeCheck> &Checks); }; -class InductiveRangeCheckElimination : public LoopPass { +class InductiveRangeCheckElimination { + ScalarEvolution &SE; + BranchProbabilityInfo *BPI; + DominatorTree &DT; + LoopInfo &LI; + +public: + InductiveRangeCheckElimination(ScalarEvolution &SE, + BranchProbabilityInfo *BPI, DominatorTree &DT, + LoopInfo &LI) + : SE(SE), BPI(BPI), DT(DT), LI(LI) {} + + bool run(Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop); +}; + +class IRCELegacyPass : public LoopPass { public: static char ID; - InductiveRangeCheckElimination() : LoopPass(ID) { - initializeInductiveRangeCheckEliminationPass( - *PassRegistry::getPassRegistry()); + IRCELegacyPass() : LoopPass(ID) { + initializeIRCELegacyPassPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -261,14 +274,14 @@ public: } // end anonymous namespace -char InductiveRangeCheckElimination::ID = 0; +char IRCELegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce", +INITIALIZE_PASS_BEGIN(IRCELegacyPass, "irce", "Inductive range check elimination", false, false) INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopPass) -INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce", - "Inductive range check elimination", false, false) +INITIALIZE_PASS_END(IRCELegacyPass, "irce", "Inductive range check elimination", + false, false) StringRef InductiveRangeCheck::rangeCheckKindToStr( InductiveRangeCheck::RangeCheckKind RCK) { @@ -299,13 +312,8 @@ InductiveRangeCheck::RangeCheckKind InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, ScalarEvolution &SE, Value *&Index, Value *&Length, bool &IsSigned) { - auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) { - const SCEV *S = SE.getSCEV(V); - if (isa<SCEVCouldNotCompute>(S)) - return false; - - return SE.getLoopDisposition(S, L) == ScalarEvolution::LoopInvariant && - SE.isKnownNonNegative(S); + auto IsLoopInvariant = [&SE, L](Value *V) { + return SE.isLoopInvariant(SE.getSCEV(V), L); }; ICmpInst::Predicate Pred = ICI->getPredicate(); @@ -337,7 +345,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, return RANGE_CHECK_LOWER; } - if (IsNonNegativeAndNotLoopVarying(LHS)) { + if (IsLoopInvariant(LHS)) { Index = RHS; Length = LHS; return RANGE_CHECK_UPPER; @@ -349,7 +357,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, LLVM_FALLTHROUGH; case ICmpInst::ICMP_UGT: IsSigned = false; - if (IsNonNegativeAndNotLoopVarying(LHS)) { + if (IsLoopInvariant(LHS)) { Index = RHS; Length = LHS; return RANGE_CHECK_BOTH; @@ -394,8 +402,23 @@ void InductiveRangeCheck::extractRangeChecksFromCond( if (!IsAffineIndex) return; + const SCEV *End = nullptr; + // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L". + // We can potentially do much better here. + if (Length) + End = SE.getSCEV(Length); + else { + assert(RCKind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!"); + // So far we can only reach this point for Signed range check. This may + // change in future. In this case we will need to pick Unsigned max for the + // unsigned range check. 
+ unsigned BitWidth = cast<IntegerType>(IndexAddRec->getType())->getBitWidth(); + const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth)); + End = SIntMax; + } + InductiveRangeCheck IRC; - IRC.End = Length ? SE.getSCEV(Length) : nullptr; + IRC.End = End; IRC.Begin = IndexAddRec->getStart(); IRC.Step = IndexAddRec->getStepRecurrence(SE); IRC.CheckUse = &ConditionUse; @@ -405,15 +428,15 @@ void InductiveRangeCheck::extractRangeChecksFromCond( } void InductiveRangeCheck::extractRangeChecksFromBranch( - BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo &BPI, + BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI, SmallVectorImpl<InductiveRangeCheck> &Checks) { if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch()) return; BranchProbability LikelyTaken(15, 16); - if (!SkipProfitabilityChecks && - BPI.getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken) + if (!SkipProfitabilityChecks && BPI && + BPI->getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken) return; SmallPtrSet<Value *, 8> Visited; @@ -504,9 +527,8 @@ struct LoopStructure { } static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &, - BranchProbabilityInfo &BPI, - Loop &, - const char *&); + BranchProbabilityInfo *BPI, + Loop &, const char *&); }; /// This class is used to constrain loops to run within a given iteration space. @@ -573,7 +595,7 @@ class LoopConstrainer { // Create the appropriate loop structure needed to describe a cloned copy of // `Original`. The clone is described by `VM`. Loop *createClonedLoopStructure(Loop *Original, Loop *Parent, - ValueToValueMapTy &VM); + ValueToValueMapTy &VM, bool IsSubloop); // Rewrite the iteration space of the loop denoted by (LS, Preheader). The // iteration space of the rewritten loop ends at ExitLoopAt. The start of the @@ -625,8 +647,8 @@ class LoopConstrainer { LLVMContext &Ctx; ScalarEvolution &SE; DominatorTree &DT; - LPPassManager &LPM; LoopInfo &LI; + function_ref<void(Loop *, bool)> LPMAddNewLoop; // Information about the original loop we started out with. Loop &OriginalLoop; @@ -646,12 +668,13 @@ class LoopConstrainer { LoopStructure MainLoopStructure; public: - LoopConstrainer(Loop &L, LoopInfo &LI, LPPassManager &LPM, + LoopConstrainer(Loop &L, LoopInfo &LI, + function_ref<void(Loop *, bool)> LPMAddNewLoop, const LoopStructure &LS, ScalarEvolution &SE, DominatorTree &DT, InductiveRangeCheck::Range R) : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()), - SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L), Range(R), - MainLoopStructure(LS) {} + SE(SE), DT(DT), LI(LI), LPMAddNewLoop(LPMAddNewLoop), OriginalLoop(L), + Range(R), MainLoopStructure(LS) {} // Entry point for the algorithm. Returns true on success. bool run(); @@ -666,56 +689,141 @@ void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block, PN->setIncomingBlock(i, ReplaceBy); } -static bool CanBeMax(ScalarEvolution &SE, const SCEV *S, bool Signed) { - APInt Max = Signed ? - APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()) : - APInt::getMaxValue(cast<IntegerType>(S->getType())->getBitWidth()); - return SE.getSignedRange(S).contains(Max) && - SE.getUnsignedRange(S).contains(Max); +static bool CannotBeMaxInLoop(const SCEV *BoundSCEV, Loop *L, + ScalarEvolution &SE, bool Signed) { + unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth(); + APInt Max = Signed ? APInt::getSignedMaxValue(BitWidth) : + APInt::getMaxValue(BitWidth); + auto Predicate = Signed ? 
ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; + return SE.isAvailableAtLoopEntry(BoundSCEV, L) && + SE.isLoopEntryGuardedByCond(L, Predicate, BoundSCEV, + SE.getConstant(Max)); +} + +/// Given a loop with an deccreasing induction variable, is it possible to +/// safely calculate the bounds of a new loop using the given Predicate. +static bool isSafeDecreasingBound(const SCEV *Start, + const SCEV *BoundSCEV, const SCEV *Step, + ICmpInst::Predicate Pred, + unsigned LatchBrExitIdx, + Loop *L, ScalarEvolution &SE) { + if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT && + Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT) + return false; + + if (!SE.isAvailableAtLoopEntry(BoundSCEV, L)) + return false; + + assert(SE.isKnownNegative(Step) && "expecting negative step"); + + LLVM_DEBUG(dbgs() << "irce: isSafeDecreasingBound with:\n"); + LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n"); + LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n"); + LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n"); + LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred) + << "\n"); + LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n"); + + bool IsSigned = ICmpInst::isSigned(Pred); + // The predicate that we need to check that the induction variable lies + // within bounds. + ICmpInst::Predicate BoundPred = + IsSigned ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT; + + if (LatchBrExitIdx == 1) + return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV); + + assert(LatchBrExitIdx == 0 && + "LatchBrExitIdx should be either 0 or 1"); + + const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType())); + unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth(); + APInt Min = IsSigned ? APInt::getSignedMinValue(BitWidth) : + APInt::getMinValue(BitWidth); + const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Min), StepPlusOne); + + const SCEV *MinusOne = + SE.getMinusSCEV(BoundSCEV, SE.getOne(BoundSCEV->getType())); + + return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, MinusOne) && + SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit); + +} + +/// Given a loop with an increasing induction variable, is it possible to +/// safely calculate the bounds of a new loop using the given Predicate. +static bool isSafeIncreasingBound(const SCEV *Start, + const SCEV *BoundSCEV, const SCEV *Step, + ICmpInst::Predicate Pred, + unsigned LatchBrExitIdx, + Loop *L, ScalarEvolution &SE) { + if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT && + Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT) + return false; + + if (!SE.isAvailableAtLoopEntry(BoundSCEV, L)) + return false; + + LLVM_DEBUG(dbgs() << "irce: isSafeIncreasingBound with:\n"); + LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n"); + LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n"); + LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n"); + LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred) + << "\n"); + LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n"); + + bool IsSigned = ICmpInst::isSigned(Pred); + // The predicate that we need to check that the induction variable lies + // within bounds. + ICmpInst::Predicate BoundPred = + IsSigned ? 
CmpInst::ICMP_SLT : CmpInst::ICMP_ULT; + + if (LatchBrExitIdx == 1) + return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV); + + assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be 0 or 1"); + + const SCEV *StepMinusOne = + SE.getMinusSCEV(Step, SE.getOne(Step->getType())); + unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth(); + APInt Max = IsSigned ? APInt::getSignedMaxValue(BitWidth) : + APInt::getMaxValue(BitWidth); + const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Max), StepMinusOne); + + return (SE.isLoopEntryGuardedByCond(L, BoundPred, Start, + SE.getAddExpr(BoundSCEV, Step)) && + SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit)); } -static bool SumCanReachMax(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2, - bool Signed) { - // S1 < INT_MAX - S2 ===> S1 + S2 < INT_MAX. - assert(SE.isKnownNonNegative(S2) && - "We expected the 2nd arg to be non-negative!"); - const SCEV *Max = SE.getConstant( - Signed ? APInt::getSignedMaxValue( - cast<IntegerType>(S1->getType())->getBitWidth()) - : APInt::getMaxValue( - cast<IntegerType>(S1->getType())->getBitWidth())); - const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2); - return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - S1, CapForS1); +static bool CannotBeMinInLoop(const SCEV *BoundSCEV, Loop *L, + ScalarEvolution &SE, bool Signed) { + unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth(); + APInt Min = Signed ? APInt::getSignedMinValue(BitWidth) : + APInt::getMinValue(BitWidth); + auto Predicate = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; + return SE.isAvailableAtLoopEntry(BoundSCEV, L) && + SE.isLoopEntryGuardedByCond(L, Predicate, BoundSCEV, + SE.getConstant(Min)); } -static bool CanBeMin(ScalarEvolution &SE, const SCEV *S, bool Signed) { - APInt Min = Signed ? - APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()) : - APInt::getMinValue(cast<IntegerType>(S->getType())->getBitWidth()); - return SE.getSignedRange(S).contains(Min) && - SE.getUnsignedRange(S).contains(Min); +static bool isKnownNonNegativeInLoop(const SCEV *BoundSCEV, const Loop *L, + ScalarEvolution &SE) { + const SCEV *Zero = SE.getZero(BoundSCEV->getType()); + return SE.isAvailableAtLoopEntry(BoundSCEV, L) && + SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SGE, BoundSCEV, Zero); } -static bool SumCanReachMin(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2, - bool Signed) { - // S1 > INT_MIN - S2 ===> S1 + S2 > INT_MIN. - assert(SE.isKnownNonPositive(S2) && - "We expected the 2nd arg to be non-positive!"); - const SCEV *Max = SE.getConstant( - Signed ? APInt::getSignedMinValue( - cast<IntegerType>(S1->getType())->getBitWidth()) - : APInt::getMinValue( - cast<IntegerType>(S1->getType())->getBitWidth())); - const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2); - return !SE.isKnownPredicate(Signed ? 
ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, - S1, CapForS1); +static bool isKnownNegativeInLoop(const SCEV *BoundSCEV, const Loop *L, + ScalarEvolution &SE) { + const SCEV *Zero = SE.getZero(BoundSCEV->getType()); + return SE.isAvailableAtLoopEntry(BoundSCEV, L) && + SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SLT, BoundSCEV, Zero); } Optional<LoopStructure> LoopStructure::parseLoopStructure(ScalarEvolution &SE, - BranchProbabilityInfo &BPI, - Loop &L, const char *&FailureReason) { + BranchProbabilityInfo *BPI, Loop &L, + const char *&FailureReason) { if (!L.isLoopSimplifyForm()) { FailureReason = "loop not in LoopSimplify form"; return None; @@ -750,7 +858,8 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0; BranchProbability ExitProbability = - BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx); + BPI ? BPI->getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx) + : BranchProbability::getZero(); if (!SkipProfitabilityChecks && ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) { @@ -816,43 +925,29 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap; }; - // Here we check whether the suggested AddRec is an induction variable that - // can be handled (i.e. with known constant step), and if yes, calculate its - // step and identify whether it is increasing or decreasing. - auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing, - ConstantInt *&StepCI) { - if (!AR->isAffine()) - return false; - - // Currently we only work with induction variables that have been proved to - // not wrap. This restriction can potentially be lifted in the future. - - if (!HasNoSignedWrap(AR)) - return false; - - if (const SCEVConstant *StepExpr = - dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) { - StepCI = StepExpr->getValue(); - assert(!StepCI->isZero() && "Zero step?"); - IsIncreasing = !StepCI->isNegative(); - return true; - } - - return false; - }; - // `ICI` is interpreted as taking the backedge if the *next* value of the // induction variable satisfies some constraint. const SCEVAddRecExpr *IndVarBase = cast<SCEVAddRecExpr>(LeftSCEV); - bool IsIncreasing = false; - bool IsSignedPredicate = true; - ConstantInt *StepCI; - if (!IsInductionVar(IndVarBase, IsIncreasing, StepCI)) { + if (!IndVarBase->isAffine()) { FailureReason = "LHS in icmp not induction variable"; return None; } + const SCEV* StepRec = IndVarBase->getStepRecurrence(SE); + if (!isa<SCEVConstant>(StepRec)) { + FailureReason = "LHS in icmp not induction variable"; + return None; + } + ConstantInt *StepCI = cast<SCEVConstant>(StepRec)->getValue(); + + if (ICI->isEquality() && !HasNoSignedWrap(IndVarBase)) { + FailureReason = "LHS in icmp needs nsw for equality predicates"; + return None; + } + assert(!StepCI->isZero() && "Zero step?"); + bool IsIncreasing = !StepCI->isNegative(); + bool IsSignedPredicate = ICmpInst::isSigned(Pred); const SCEV *StartNext = IndVarBase->getStart(); const SCEV *Addend = SE.getNegativeSCEV(IndVarBase->getStepRecurrence(SE)); const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend); @@ -870,22 +965,29 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, // If both parts are known non-negative, it is profitable to use // unsigned comparison in increasing loop. This allows us to make the // comparison check against "RightSCEV + 1" more optimistic. 
-      if (SE.isKnownNonNegative(IndVarStart) &&
-          SE.isKnownNonNegative(RightSCEV))
+      if (isKnownNonNegativeInLoop(IndVarStart, &L, SE) &&
+          isKnownNonNegativeInLoop(RightSCEV, &L, SE))
         Pred = ICmpInst::ICMP_ULT;
       else
         Pred = ICmpInst::ICMP_SLT;
-    else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
-             !CanBeMin(SE, RightSCEV, /* IsSignedPredicate */ true)) {
+    else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
       // while (true) {               while (true) {
       //   if (++i == len)     --->     if (++i > len - 1)
       //     break;                       break;
       //   ...                          ...
       // }                            }
-      // TODO: Insert ICMP_UGT if both are non-negative?
-      Pred = ICmpInst::ICMP_SGT;
-      RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
-      DecreasedRightValueByOne = true;
+      if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
+          CannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/false)) {
+        Pred = ICmpInst::ICMP_UGT;
+        RightSCEV = SE.getMinusSCEV(RightSCEV,
+                                    SE.getOne(RightSCEV->getType()));
+        DecreasedRightValueByOne = true;
+      } else if (CannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/true)) {
+        Pred = ICmpInst::ICMP_SGT;
+        RightSCEV = SE.getMinusSCEV(RightSCEV,
+                                    SE.getOne(RightSCEV->getType()));
+        DecreasedRightValueByOne = true;
+      }
     }
   }
@@ -899,36 +1001,18 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
       return None;
     }

-    IsSignedPredicate =
-        Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
-
+    IsSignedPredicate = ICmpInst::isSigned(Pred);
     if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
       FailureReason = "unsigned latch conditions are explicitly prohibited";
       return None;
     }

-    // The predicate that we need to check that the induction variable lies
-    // within bounds.
-    ICmpInst::Predicate BoundPred =
-        IsSignedPredicate ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
-
+    if (!isSafeIncreasingBound(IndVarStart, RightSCEV, Step, Pred,
+                               LatchBrExitIdx, &L, SE)) {
+      FailureReason = "Unsafe loop bounds";
+      return None;
+    }
     if (LatchBrExitIdx == 0) {
-      const SCEV *StepMinusOne = SE.getMinusSCEV(Step,
-                                                 SE.getOne(Step->getType()));
-      if (SumCanReachMax(SE, RightSCEV, StepMinusOne, IsSignedPredicate)) {
-        // TODO: this restriction is easily removable -- we just have to
-        // remember that the icmp was an slt and not an sle.
-        FailureReason = "limit may overflow when coercing le to lt";
-        return None;
-      }
-
-      if (!SE.isLoopEntryGuardedByCond(
-              &L, BoundPred, IndVarStart,
-              SE.getAddExpr(RightSCEV, Step))) {
-        FailureReason = "Induction variable start not bounded by upper limit";
-        return None;
-      }
-
       // We need to increase the right value unless we have already decreased
       // it virtually when we replaced EQ with SGT.
       if (!DecreasedRightValueByOne) {
@@ -936,10 +1020,6 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
         RightValue = B.CreateAdd(RightValue, One);
       }
     } else {
-      if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) {
-        FailureReason = "Induction variable start not bounded by upper limit";
-        return None;
-      }
       assert(!DecreasedRightValueByOne &&
              "Right value can be decreased only for LatchBrExitIdx == 0!");
     }
@@ -955,17 +1035,22 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
       // that both operands are non-negative, because it will only pessimize
       // our check against "RightSCEV - 1".
       Pred = ICmpInst::ICMP_SGT;
-    else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
-             !CanBeMax(SE, RightSCEV, /* IsSignedPredicate */ true)) {
+    else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
       // while (true) {               while (true) {
       //   if (--i == len)     --->     if (--i < len + 1)
       //     break;                       break;
       //   ...                          ...
       // }                            }
-      // TODO: Insert ICMP_ULT if both are non-negative?
-      Pred = ICmpInst::ICMP_SLT;
-      RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
-      IncreasedRightValueByOne = true;
+      if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
+          CannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ false)) {
+        Pred = ICmpInst::ICMP_ULT;
+        RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+        IncreasedRightValueByOne = true;
+      } else if (CannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ true)) {
+        Pred = ICmpInst::ICMP_SLT;
+        RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+        IncreasedRightValueByOne = true;
+      }
     }
   }
@@ -988,27 +1073,13 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
       return None;
     }

-    // The predicate that we need to check that the induction variable lies
-    // within bounds.
-    ICmpInst::Predicate BoundPred =
-        IsSignedPredicate ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
+    if (!isSafeDecreasingBound(IndVarStart, RightSCEV, Step, Pred,
+                               LatchBrExitIdx, &L, SE)) {
+      FailureReason = "Unsafe bounds";
+      return None;
+    }
     if (LatchBrExitIdx == 0) {
-      const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
-      if (SumCanReachMin(SE, RightSCEV, StepPlusOne, IsSignedPredicate)) {
-        // TODO: this restriction is easily removable -- we just have to
-        // remember that the icmp was an sgt and not an sge.
-        FailureReason = "limit may overflow when coercing ge to gt";
-        return None;
-      }
-
-      if (!SE.isLoopEntryGuardedByCond(
-              &L, BoundPred, IndVarStart,
-              SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) {
-        FailureReason = "Induction variable start not bounded by lower limit";
-        return None;
-      }
-
       // We need to decrease the right value unless we have already increased
       // it virtually when we replaced EQ with SLT.
       if (!IncreasedRightValueByOne) {
@@ -1016,10 +1087,6 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
         RightValue = B.CreateSub(RightValue, One);
       }
     } else {
-      if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) {
-        FailureReason = "Induction variable start not bounded by lower limit";
-        return None;
-      }
       assert(!IncreasedRightValueByOne &&
              "Right value can be increased only for LatchBrExitIdx == 0!");
     }
@@ -1381,13 +1448,14 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
 }

 Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
-                                                 ValueToValueMapTy &VM) {
+                                                 ValueToValueMapTy &VM,
+                                                 bool IsSubloop) {
   Loop &New = *LI.AllocateLoop();
   if (Parent)
     Parent->addChildLoop(&New);
   else
     LI.addTopLevelLoop(&New);
-  LPM.addLoop(New);
+  LPMAddNewLoop(&New, IsSubloop);

   // Add all of the blocks in Original to the new loop.
   for (auto *BB : Original->blocks())
@@ -1396,7 +1464,7 @@ Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,

   // Add all of the subloops to the new loop.
   for (Loop *SubLoop : *Original)
-    createClonedLoopStructure(SubLoop, &New, VM);
+    createClonedLoopStructure(SubLoop, &New, VM, /* IsSubloop */ true);

   return &New;
 }
@@ -1414,7 +1482,7 @@ bool LoopConstrainer::run() {
   bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate;
   Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate);
   if (!MaybeSR.hasValue()) {
-    DEBUG(dbgs() << "irce: could not compute subranges\n");
+    LLVM_DEBUG(dbgs() << "irce: could not compute subranges\n");
     return false;
   }
@@ -1446,19 +1514,22 @@ bool LoopConstrainer::run() {
     if (Increasing)
       ExitPreLoopAtSCEV = *SR.LowLimit;
     else {
-      if (CanBeMin(SE, *SR.HighLimit, IsSignedPredicate)) {
-        DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
-                     << "preloop exit limit. HighLimit = " << *(*SR.HighLimit)
-                     << "\n");
+      if (CannotBeMinInLoop(*SR.HighLimit, &OriginalLoop, SE,
+                            IsSignedPredicate))
+        ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
+      else {
+        LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+                          << "preloop exit limit. HighLimit = "
+                          << *(*SR.HighLimit) << "\n");
         return false;
       }
-      ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
     }

     if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) {
-      DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
-                   << " preloop exit limit " << *ExitPreLoopAtSCEV
-                   << " at block " << InsertPt->getParent()->getName() << "\n");
+      LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+                        << " preloop exit limit " << *ExitPreLoopAtSCEV
+                        << " at block " << InsertPt->getParent()->getName()
+                        << "\n");
       return false;
     }
@@ -1472,19 +1543,22 @@ bool LoopConstrainer::run() {
     if (Increasing)
       ExitMainLoopAtSCEV = *SR.HighLimit;
     else {
-      if (CanBeMin(SE, *SR.LowLimit, IsSignedPredicate)) {
-        DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
-                     << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit)
-                     << "\n");
+      if (CannotBeMinInLoop(*SR.LowLimit, &OriginalLoop, SE,
+                            IsSignedPredicate))
+        ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
+      else {
+        LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+                          << "mainloop exit limit. LowLimit = "
+                          << *(*SR.LowLimit) << "\n");
         return false;
       }
-      ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
     }

     if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) {
-      DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
-                   << " main loop exit limit " << *ExitMainLoopAtSCEV
-                   << " at block " << InsertPt->getParent()->getName() << "\n");
+      LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+                        << " main loop exit limit " << *ExitMainLoopAtSCEV
+                        << " at block " << InsertPt->getParent()->getName()
+                        << "\n");
       return false;
     }
@@ -1546,13 +1620,15 @@ bool LoopConstrainer::run() {
   // LI when LoopSimplifyForm is generated.
   Loop *PreL = nullptr, *PostL = nullptr;
   if (!PreLoop.Blocks.empty()) {
-    PreL = createClonedLoopStructure(
-        &OriginalLoop, OriginalLoop.getParentLoop(), PreLoop.Map);
+    PreL = createClonedLoopStructure(&OriginalLoop,
+                                     OriginalLoop.getParentLoop(), PreLoop.Map,
+                                     /* IsSubLoop */ false);
   }

   if (!PostLoop.Blocks.empty()) {
-    PostL = createClonedLoopStructure(
-        &OriginalLoop, OriginalLoop.getParentLoop(), PostLoop.Map);
+    PostL =
+        createClonedLoopStructure(&OriginalLoop, OriginalLoop.getParentLoop(),
+                                  PostLoop.Map, /* IsSubLoop */ false);
   }

   // This function canonicalizes the loop into Loop-Simplify and LCSSA forms.
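Several hunks above gate a "subtract one from the bound" rewrite (coercing "== len" to "> len - 1" in the latch condition, or computing HighLimit - 1 as the preloop exit limit) behind CannotBeMinInLoop-style checks. A minimal standalone C++ sketch of the wraparound hazard those checks rule out (plain fixed-width integers, not the LLVM SCEV API; illustration only):

#include <cstdint>
#include <cstdio>

int main() {
  // If len can be the minimum value of its type, "len - 1" wraps around and
  // "i > len - 1" is no longer equivalent to "i == len".
  int8_t len = INT8_MIN;                  // -128: the case the checks exclude
  int8_t lenMinusOne = (int8_t)(len - 1); // wraps to +127
  // "i == len" exits at i == -128, but "i > 127" is never true for an int8_t,
  // so the rewritten exit test would never fire.
  printf("len = %d, len - 1 = %d\n", len, lenMinusOne);
  return 0;
}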
@@ -1618,32 +1694,34 @@ InductiveRangeCheck::computeSafeIterationSpace(
   unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth();
   const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));

-  // Substract Y from X so that it does not go through border of the IV
+  // Subtract Y from X so that it does not go through border of the IV
   // iteration space. Mathematically, it is equivalent to:
   //
-  //    ClampedSubstract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX).  [1]
+  //    ClampedSubtract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX).  [1]
   //
-  // In [1], 'X - Y' is a mathematical substraction (result is not bounded to
+  // In [1], 'X - Y' is a mathematical subtraction (result is not bounded to
   // any width of bit grid). But after we take min/max, the result is
   // guaranteed to be within [INT_MIN, INT_MAX].
   //
   // In [1], INT_MAX and INT_MIN are respectively signed and unsigned max/min
   // values, depending on type of latch condition that defines IV iteration
   // space.
-  auto ClampedSubstract = [&](const SCEV *X, const SCEV *Y) {
-    assert(SE.isKnownNonNegative(X) &&
-           "We can only substract from values in [0; SINT_MAX]!");
+  auto ClampedSubtract = [&](const SCEV *X, const SCEV *Y) {
+    // FIXME: The current implementation assumes that X is in [0, SINT_MAX].
+    // This is required to ensure that SINT_MAX - X does not overflow signed and
+    // that X - Y does not overflow unsigned if Y is negative. Can we lift this
+    // restriction and make it work for negative X as well?
     if (IsLatchSigned) {
       // X is a number from signed range, Y is interpreted as signed.
       // Even if Y is SINT_MAX, (X - Y) does not reach SINT_MIN. So the only
       // thing we should care about is that we didn't cross SINT_MAX.
-      // So, if Y is positive, we substract Y safely.
+      // So, if Y is positive, we subtract Y safely.
       //   Rule 1: Y > 0 ---> Y.
-      // If 0 <= -Y <= (SINT_MAX - X), we substract Y safely.
+      // If 0 <= -Y <= (SINT_MAX - X), we subtract Y safely.
       //   Rule 2: Y >=s (X - SINT_MAX) ---> Y.
-      // If 0 <= (SINT_MAX - X) < -Y, we can only substract (X - SINT_MAX).
+      // If 0 <= (SINT_MAX - X) < -Y, we can only subtract (X - SINT_MAX).
       //   Rule 3: Y <s (X - SINT_MAX) ---> (X - SINT_MAX).
-      // It gives us smax(Y, X - SINT_MAX) to substract in all cases.
+      // It gives us smax(Y, X - SINT_MAX) to subtract in all cases.
       const SCEV *XMinusSIntMax = SE.getMinusSCEV(X, SIntMax);
       return SE.getMinusSCEV(X, SE.getSMaxExpr(Y, XMinusSIntMax),
                              SCEV::FlagNSW);
@@ -1651,29 +1729,45 @@ InductiveRangeCheck::computeSafeIterationSpace(
     // X is a number from unsigned range, Y is interpreted as signed.
     // Even if Y is SINT_MIN, (X - Y) does not reach UINT_MAX. So the only
     // thing we should care about is that we didn't cross zero.
-    // So, if Y is negative, we substract Y safely.
+    // So, if Y is negative, we subtract Y safely.
     //   Rule 1: Y <s 0 ---> Y.
-    // If 0 <= Y <= X, we substract Y safely.
+    // If 0 <= Y <= X, we subtract Y safely.
     //   Rule 2: Y <=s X ---> Y.
-    // If 0 <= X < Y, we should stop at 0 and can only substract X.
+    // If 0 <= X < Y, we should stop at 0 and can only subtract X.
     //   Rule 3: Y >s X ---> X.
-    // It gives us smin(X, Y) to substract in all cases.
+    // It gives us smin(X, Y) to subtract in all cases.
     return SE.getMinusSCEV(X, SE.getSMinExpr(X, Y), SCEV::FlagNUW);
   };
   const SCEV *M = SE.getMinusSCEV(C, A);
   const SCEV *Zero = SE.getZero(M->getType());
-  const SCEV *Begin = ClampedSubstract(Zero, M);
-  const SCEV *L = nullptr;
-  // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
-  // We can potentially do much better here.
-  if (const SCEV *EndLimit = getEnd())
-    L = EndLimit;
-  else {
-    assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
-    L = SIntMax;
-  }
-  const SCEV *End = ClampedSubstract(L, M);
+  // This function returns a SCEV equal to 1 if X is non-negative, 0 otherwise.
+  auto SCEVCheckNonNegative = [&](const SCEV *X) {
+    const Loop *L = IndVar->getLoop();
+    const SCEV *One = SE.getOne(X->getType());
+    // Can we trivially prove that X is a non-negative or negative value?
+    if (isKnownNonNegativeInLoop(X, L, SE))
+      return One;
+    else if (isKnownNegativeInLoop(X, L, SE))
+      return Zero;
+    // If not, we will have to figure it out during the execution.
+    // Function smax(smin(X, 0), -1) + 1 equals 1 if X >= 0 and 0 if X < 0.
+    const SCEV *NegOne = SE.getNegativeSCEV(One);
+    return SE.getAddExpr(SE.getSMaxExpr(SE.getSMinExpr(X, Zero), NegOne), One);
+  };
+  // FIXME: Current implementation of ClampedSubtract implicitly assumes that
+  // X is non-negative (in sense of a signed value). We need to re-implement
+  // this function in a way that it will correctly handle negative X as well.
+  // We use it twice: for X = 0 everything is fine, but for X = getEnd() we can
+  // end up with a negative X and produce wrong results. So currently we ensure
+  // that if getEnd() is negative then both ends of the safe range are zero.
+  // Note that this may pessimize elimination of unsigned range checks against
+  // negative values.
+  const SCEV *REnd = getEnd();
+  const SCEV *EndIsNonNegative = SCEVCheckNonNegative(REnd);
+
+  const SCEV *Begin = SE.getMulExpr(ClampedSubtract(Zero, M), EndIsNonNegative);
+  const SCEV *End = SE.getMulExpr(ClampedSubtract(REnd, M), EndIsNonNegative);

   return InductiveRangeCheck::Range(Begin, End);
 }
@@ -1735,26 +1829,56 @@ IntersectUnsignedRange(ScalarEvolution &SE,
   return Ret;
 }

-bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
+PreservedAnalyses IRCEPass::run(Loop &L, LoopAnalysisManager &AM,
+                                LoopStandardAnalysisResults &AR,
+                                LPMUpdater &U) {
+  Function *F = L.getHeader()->getParent();
+  const auto &FAM =
+      AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
+  auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F);
+  InductiveRangeCheckElimination IRCE(AR.SE, BPI, AR.DT, AR.LI);
+  auto LPMAddNewLoop = [&U](Loop *NL, bool IsSubloop) {
+    if (!IsSubloop)
+      U.addSiblingLoops(NL);
+  };
+  bool Changed = IRCE.run(&L, LPMAddNewLoop);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  return getLoopPassPreservedAnalyses();
+}
+
+bool IRCELegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
   if (skipLoop(L))
     return false;

+  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  BranchProbabilityInfo &BPI =
+      getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI);
+  auto LPMAddNewLoop = [&LPM](Loop *NL, bool /* IsSubLoop */) {
+    LPM.addLoop(*NL);
+  };
+  return IRCE.run(L, LPMAddNewLoop);
+}
+
+bool InductiveRangeCheckElimination::run(
+    Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop) {
   if (L->getBlocks().size() >= LoopSizeCutoff) {
-    DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";);
+    LLVM_DEBUG(dbgs() << "irce: giving up constraining loop, too large\n");
     return false;
   }

   BasicBlock *Preheader = L->getLoopPreheader();
   if (!Preheader) {
-    DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
+    LLVM_DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
     return false;
   }

   LLVMContext &Context = Preheader->getContext();
   SmallVector<InductiveRangeCheck, 16> RangeChecks;
-  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-  BranchProbabilityInfo &BPI =
-      getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();

   for (auto BBI : L->getBlocks())
     if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
@@ -1772,7 +1896,7 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
     IRC.print(OS);
   };

-  DEBUG(PrintRecognizedRangeChecks(dbgs()));
+  LLVM_DEBUG(PrintRecognizedRangeChecks(dbgs()));

   if (PrintRangeChecks)
     PrintRecognizedRangeChecks(errs());
@@ -1781,8 +1905,8 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
   Optional<LoopStructure> MaybeLoopStructure =
       LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason);
   if (!MaybeLoopStructure.hasValue()) {
-    DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason
-                 << "\n";);
+    LLVM_DEBUG(dbgs() << "irce: could not parse loop structure: "
+                      << FailureReason << "\n";);
     return false;
   }
   LoopStructure LS = MaybeLoopStructure.getValue();
@@ -1820,9 +1944,8 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
   if (!SafeIterRange.hasValue())
     return false;

-  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LPM,
-                     LS, SE, DT, SafeIterRange.getValue());
+  LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT,
+                     SafeIterRange.getValue());
   bool Changed = LC.run();

   if (Changed) {
@@ -1833,7 +1956,7 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
       L->print(dbgs());
     };

-    DEBUG(PrintConstrainedLoopInfo());
+    LLVM_DEBUG(PrintConstrainedLoopInfo());

     if (PrintChangedLoops)
       PrintConstrainedLoopInfo();
@@ -1852,5 +1975,5 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
 }

 Pass *llvm::createInductiveRangeCheckEliminationPass() {
-  return new InductiveRangeCheckElimination;
+  return new IRCELegacyPass();
 }
diff --git a/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 7d66c0f73821..fbbc09eb487f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -97,6 +97,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
@@ -121,7 +122,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <cassert>
 #include <iterator>
@@ -140,7 +140,7 @@ namespace {

 using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;

-/// \brief InferAddressSpaces
+/// InferAddressSpaces
 class InferAddressSpaces : public FunctionPass {
   /// Target specific address space which uses of should be replaced if
   /// possible.
@@ -260,7 +260,10 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
   switch (II->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_inc:
-  case Intrinsic::amdgcn_atomic_dec:{
+  case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_ds_fadd:
+  case Intrinsic::amdgcn_ds_fmin:
+  case Intrinsic::amdgcn_ds_fmax: {
     const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
     if (!IsVolatile || !IsVolatile->isZero())
       return false;
@@ -289,6 +292,9 @@ void InferAddressSpaces::collectRewritableIntrinsicOperands(
   case Intrinsic::objectsize:
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_ds_fadd:
+  case Intrinsic::amdgcn_ds_fmin:
+  case Intrinsic::amdgcn_ds_fmax:
     appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
                                                  PostorderStack, Visited);
     break;
@@ -647,13 +653,13 @@ void InferAddressSpaces::inferAddressSpaces(

     // Tries to update the address space of the stack top according to the
     // address spaces of its operands.
-    DEBUG(dbgs() << "Updating the address space of\n  " << *V << '\n');
+    LLVM_DEBUG(dbgs() << "Updating the address space of\n  " << *V << '\n');
     Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
     if (!NewAS.hasValue())
       continue;
     // If any updates are made, grabs its users to the worklist because
     // their address spaces can also be possibly updated.
-    DEBUG(dbgs() << "  to " << NewAS.getValue() << '\n');
+    LLVM_DEBUG(dbgs() << "  to " << NewAS.getValue() << '\n');
     (*InferredAddrSpace)[V] = NewAS.getValue();

     for (Value *User : V->users()) {
@@ -779,7 +785,7 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,

   if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
     B.CreateMemSet(NewV, MSI->getValue(),
-                   MSI->getLength(), MSI->getAlignment(),
+                   MSI->getLength(), MSI->getDestAlignment(),
                    false, // isVolatile
                    TBAA, ScopeMD, NoAliasMD);
   } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
@@ -795,14 +801,16 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,

     if (isa<MemCpyInst>(MTI)) {
       MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
-      B.CreateMemCpy(Dest, Src, MTI->getLength(),
-                     MTI->getAlignment(),
+      B.CreateMemCpy(Dest, MTI->getDestAlignment(),
+                     Src, MTI->getSourceAlignment(),
+                     MTI->getLength(),
                      false, // isVolatile
                      TBAA, TBAAStruct, ScopeMD, NoAliasMD);
     } else {
       assert(isa<MemMoveInst>(MTI));
-      B.CreateMemMove(Dest, Src, MTI->getLength(),
-                      MTI->getAlignment(),
+      B.CreateMemMove(Dest, MTI->getDestAlignment(),
+                      Src, MTI->getSourceAlignment(),
+                      MTI->getLength(),
                       false, // isVolatile
                       TBAA, ScopeMD, NoAliasMD);
     }
@@ -893,15 +901,15 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
     if (NewV == nullptr)
       continue;

-    DEBUG(dbgs() << "Replacing the uses of " << *V
-                 << "\n  with\n  " << *NewV << '\n');
+    LLVM_DEBUG(dbgs() << "Replacing the uses of " << *V << "\n  with\n  "
+                      << *NewV << '\n');

     if (Constant *C = dyn_cast<Constant>(V)) {
       Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
                                                          C->getType());
       if (C != Replace) {
-        DEBUG(dbgs() << "Inserting replacement const cast: "
-              << Replace << ": " << *Replace << '\n');
+        LLVM_DEBUG(dbgs() << "Inserting replacement const cast: " << Replace
+                          << ": " << *Replace << '\n');
         C->replaceAllUsesWith(Replace);
         V = Replace;
       }
diff --git a/contrib/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp b/contrib/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp
new file mode 100644
index 000000000000..05cd48d83267
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp
@@ -0,0 +1,144 @@
+//===- InstSimplifyPass.cpp -----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "instsimplify"
+
+STATISTIC(NumSimplified, "Number of redundant instructions removed");
+
+static bool runImpl(Function &F, const SimplifyQuery &SQ,
+                    OptimizationRemarkEmitter *ORE) {
+  SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+  bool Changed = false;
+
+  do {
+    for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+      // Here be subtlety: the iterator must be incremented before the loop
+      // body (not sure why), so a range-for loop won't work here.
+      for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+        Instruction *I = &*BI++;
+        // The first time through the loop ToSimplify is empty and we try to
+        // simplify all instructions. On later iterations ToSimplify is not
+        // empty and we only bother simplifying instructions that are in it.
+        if (!ToSimplify->empty() && !ToSimplify->count(I))
+          continue;
+
+        // Don't waste time simplifying unused instructions.
+        if (!I->use_empty()) {
+          if (Value *V = SimplifyInstruction(I, SQ, ORE)) {
+            // Mark all uses for resimplification next time round the loop.
+            for (User *U : I->users())
+              Next->insert(cast<Instruction>(U));
+            I->replaceAllUsesWith(V);
+            ++NumSimplified;
+            Changed = true;
+          }
+        }
+        if (RecursivelyDeleteTriviallyDeadInstructions(I, SQ.TLI)) {
+          // RecursivelyDeleteTriviallyDeadInstruction can remove more than one
+          // instruction, so simply incrementing the iterator does not work.
+          // When instructions get deleted re-iterate instead.
+          BI = BB->begin();
+          BE = BB->end();
+          Changed = true;
+        }
+      }
+    }
+
+    // Place the list of instructions to simplify on the next loop iteration
+    // into ToSimplify.
+    std::swap(ToSimplify, Next);
+    Next->clear();
+  } while (!ToSimplify->empty());
+
+  return Changed;
+}
+
+namespace {
+struct InstSimplifyLegacyPass : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  InstSimplifyLegacyPass() : FunctionPass(ID) {
+    initializeInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+  }
+
+  /// runOnFunction - Remove instructions that simplify.
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    const DominatorTree *DT =
+        &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    const TargetLibraryInfo *TLI =
+        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    AssumptionCache *AC =
+        &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    OptimizationRemarkEmitter *ORE =
+        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    const SimplifyQuery SQ(DL, TLI, DT, AC);
+    return runImpl(F, SQ, ORE);
+  }
+};
+} // namespace
+
+char InstSimplifyLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(InstSimplifyLegacyPass, "instsimplify",
+                      "Remove redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(InstSimplifyLegacyPass, "instsimplify",
+                    "Remove redundant instructions", false, false)
+
+// Public interface to the simplify instructions pass.
+FunctionPass *llvm::createInstSimplifyLegacyPass() {
+  return new InstSimplifyLegacyPass();
+}
+
+PreservedAnalyses InstSimplifyPass::run(Function &F,
+                                        FunctionAnalysisManager &AM) {
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  const SimplifyQuery SQ(DL, &TLI, &DT, &AC);
+  bool Changed = runImpl(F, SQ, &ORE);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 2f1645433fb8..1d66472f93c8 100644
--- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -30,6 +30,7 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -64,7 +65,6 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <algorithm>
@@ -131,10 +131,11 @@ namespace {
     bool runOnFunction(Function &F) override;

     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      if (PrintLVIAfterJumpThreading)
-        AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addRequired<AAResultsWrapperPass>();
       AU.addRequired<LazyValueInfoWrapperPass>();
+      AU.addPreserved<LazyValueInfoWrapperPass>();
       AU.addPreserved<GlobalsAAWrapperPass>();
       AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
@@ -148,6 +149,7 @@ char JumpThreading::ID = 0;

 INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading", "Jump Threading",
                       false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
@@ -164,7 +166,7 @@ JumpThreadingPass::JumpThreadingPass(int T) {
 }

 // Update branch probability information according to conditional
-// branch probablity. This is usually made possible for cloned branches
+// branch probability. This is usually made possible for cloned branches
 // in inline instances by the context specific profile in the caller.
 // For instance,
 //
@@ -278,8 +280,12 @@ bool JumpThreading::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
   auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  // Get DT analysis before LVI. When LVI is initialized it conditionally adds
+  // DT if it's available.
+  auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
   auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  DeferredDominance DDT(*DT);
   std::unique_ptr<BlockFrequencyInfo> BFI;
   std::unique_ptr<BranchProbabilityInfo> BPI;
   bool HasProfileData = F.hasProfileData();
@@ -289,12 +295,11 @@ bool JumpThreading::runOnFunction(Function &F) {
     BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
   }

-  bool Changed = Impl.runImpl(F, TLI, LVI, AA, HasProfileData, std::move(BFI),
-                              std::move(BPI));
+  bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DDT, HasProfileData,
+                              std::move(BFI), std::move(BPI));
   if (PrintLVIAfterJumpThreading) {
     dbgs() << "LVI for function '" << F.getName() << "':\n";
-    LVI->printLVI(F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
-                  dbgs());
+    LVI->printLVI(F, *DT, dbgs());
   }
   return Changed;
 }
@@ -302,8 +307,12 @@ bool JumpThreading::runOnFunction(Function &F) {
 PreservedAnalyses JumpThreadingPass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  // Get DT analysis before LVI. When LVI is initialized it conditionally adds
+  // DT if it's available.
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
   auto &LVI = AM.getResult<LazyValueAnalysis>(F);
   auto &AA = AM.getResult<AAManager>(F);
+  DeferredDominance DDT(DT);

   std::unique_ptr<BlockFrequencyInfo> BFI;
   std::unique_ptr<BranchProbabilityInfo> BPI;
@@ -313,25 +322,28 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
     BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
   }

-  bool Changed = runImpl(F, &TLI, &LVI, &AA, HasProfileData, std::move(BFI),
-                         std::move(BPI));
+  bool Changed = runImpl(F, &TLI, &LVI, &AA, &DDT, HasProfileData,
+                         std::move(BFI), std::move(BPI));

   if (!Changed)
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
   PA.preserve<GlobalsAA>();
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<LazyValueAnalysis>();
   return PA;
 }

 bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
                                 LazyValueInfo *LVI_, AliasAnalysis *AA_,
-                                bool HasProfileData_,
+                                DeferredDominance *DDT_, bool HasProfileData_,
                                 std::unique_ptr<BlockFrequencyInfo> BFI_,
                                 std::unique_ptr<BranchProbabilityInfo> BPI_) {
-  DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
+  LLVM_DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
   TLI = TLI_;
   LVI = LVI_;
   AA = AA_;
+  DDT = DDT_;
   BFI.reset();
   BPI.reset();
   // When profile data is available, we need to update edge weights after
@@ -345,69 +357,66 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
     BFI = std::move(BFI_);
   }

-  // Remove unreachable blocks from function as they may result in infinite
-  // loop. We do threading if we found something profitable. Jump threading a
-  // branch can create other opportunities. If these opportunities form a cycle
-  // i.e. if any jump threading is undoing previous threading in the path, then
-  // we will loop forever. We take care of this issue by not jump threading for
-  // back edges. This works for normal cases but not for unreachable blocks as
-  // they may have cycle with no back edge.
-  bool EverChanged = false;
-  EverChanged |= removeUnreachableBlocks(F, LVI);
+  // JumpThreading must not process blocks unreachable from entry. It's a
+  // waste of compute time and can potentially lead to hangs.
+  SmallPtrSet<BasicBlock *, 16> Unreachable;
+  DominatorTree &DT = DDT->flush();
+  for (auto &BB : F)
+    if (!DT.isReachableFromEntry(&BB))
+      Unreachable.insert(&BB);

   FindLoopHeaders(F);

+  bool EverChanged = false;
   bool Changed;
   do {
     Changed = false;
-    for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
-      BasicBlock *BB = &*I;
-      // Thread all of the branches we can over this block.
-      while (ProcessBlock(BB))
+    for (auto &BB : F) {
+      if (Unreachable.count(&BB))
+        continue;
+      while (ProcessBlock(&BB)) // Thread all of the branches we can over BB.
         Changed = true;
+      // Stop processing BB if it's the entry or is now deleted. The following
+      // routines attempt to eliminate BB and locating a suitable replacement
+      // for the entry is non-trivial.
+      if (&BB == &F.getEntryBlock() || DDT->pendingDeletedBB(&BB))
+        continue;

-      ++I;
-
-      // If the block is trivially dead, zap it. This eliminates the successor
-      // edges which simplifies the CFG.
-      if (pred_empty(BB) &&
-          BB != &BB->getParent()->getEntryBlock()) {
-        DEBUG(dbgs() << "  JT: Deleting dead block '" << BB->getName()
-              << "' with terminator: " << *BB->getTerminator() << '\n');
-        LoopHeaders.erase(BB);
-        LVI->eraseBlock(BB);
-        DeleteDeadBlock(BB);
+      if (pred_empty(&BB)) {
+        // When ProcessBlock makes BB unreachable it doesn't bother to fix up
+        // the instructions in it. We must remove BB to prevent invalid IR.
+        LLVM_DEBUG(dbgs() << "  JT: Deleting dead block '" << BB.getName()
+                          << "' with terminator: " << *BB.getTerminator()
+                          << '\n');
+        LoopHeaders.erase(&BB);
+        LVI->eraseBlock(&BB);
+        DeleteDeadBlock(&BB, DDT);
         Changed = true;
         continue;
       }

-      BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
-
-      // Can't thread an unconditional jump, but if the block is "almost
-      // empty", we can replace uses of it with uses of the successor and make
-      // this dead.
-      // We should not eliminate the loop header or latch either, because
-      // eliminating a loop header or latch might later prevent LoopSimplify
-      // from transforming nested loops into simplified form. We will rely on
-      // later passes in backend to clean up empty blocks.
+      // ProcessBlock doesn't thread BBs with unconditional TIs. However, if BB
+      // is "almost empty", we attempt to merge BB with its sole successor.
+      auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
       if (BI && BI->isUnconditional() &&
-          BB != &BB->getParent()->getEntryBlock() &&
-          // If the terminator is the only non-phi instruction, try to nuke it.
-          BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB) &&
-          !LoopHeaders.count(BI->getSuccessor(0))) {
-        // FIXME: It is always conservatively correct to drop the info
-        // for a block even if it doesn't get erased. This isn't totally
-        // awesome, but it allows us to use AssertingVH to prevent nasty
-        // dangling pointer issues within LazyValueInfo.
-        LVI->eraseBlock(BB);
-        if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
-          Changed = true;
+          // The terminator must be the only non-phi instruction in BB.
+          BB.getFirstNonPHIOrDbg()->isTerminator() &&
+          // Don't alter Loop headers and latches to ensure another pass can
+          // detect and transform nested loops later.
+          !LoopHeaders.count(&BB) && !LoopHeaders.count(BI->getSuccessor(0)) &&
+          TryToSimplifyUncondBranchFromEmptyBlock(&BB, DDT)) {
+        // BB is valid for cleanup here because we passed in DDT. F remains
+        // BB's parent until a DDT->flush() event.
+        LVI->eraseBlock(&BB);
+        Changed = true;
       }
     }
     EverChanged |= Changed;
   } while (Changed);

   LoopHeaders.clear();
+  DDT->flush();
+  LVI->enableDT();
   return EverChanged;
 }
@@ -600,6 +609,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
   // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
   // Perhaps getConstantOnEdge should be smart enough to do this?

+  if (DDT->pending())
+    LVI->disableDT();
+  else
+    LVI->enableDT();
   for (BasicBlock *P : predecessors(BB)) {
     // If the value is known by LazyValueInfo to be a constant in a
     // predecessor, use that information to try to thread this block.
@@ -613,6 +626,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(

   /// If I is a PHI node, then we know the incoming values for any constants.
   if (PHINode *PN = dyn_cast<PHINode>(I)) {
+    if (DDT->pending())
+      LVI->disableDT();
+    else
+      LVI->enableDT();
     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
       Value *InVal = PN->getIncomingValue(i);
       if (Constant *KC = getKnownConstant(InVal, Preference)) {
@@ -630,11 +647,9 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
   }

   // Handle Cast instructions. Only see through Cast when the source operand is
-  // PHI or Cmp and the source type is i1 to save the compilation time.
+  // PHI or Cmp to save the compilation time.
   if (CastInst *CI = dyn_cast<CastInst>(I)) {
     Value *Source = CI->getOperand(0);
-    if (!Source->getType()->isIntegerTy(1))
-      return false;
     if (!isa<PHINode>(Source) && !isa<CmpInst>(Source))
       return false;
     ComputeValueKnownInPredecessors(Source, BB, Result, Preference, CxtI);
@@ -738,20 +753,36 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
     CmpInst::Predicate Pred = Cmp->getPredicate();

     PHINode *PN = dyn_cast<PHINode>(CmpLHS);
+    if (!PN)
+      PN = dyn_cast<PHINode>(CmpRHS);
     if (PN && PN->getParent() == BB) {
       const DataLayout &DL = PN->getModule()->getDataLayout();
       // We can do this simplification if any comparisons fold to true or false.
       // See if any do.
+      if (DDT->pending())
+        LVI->disableDT();
+      else
+        LVI->enableDT();
       for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
         BasicBlock *PredBB = PN->getIncomingBlock(i);
-        Value *LHS = PN->getIncomingValue(i);
-        Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB);
-
+        Value *LHS, *RHS;
+        if (PN == CmpLHS) {
+          LHS = PN->getIncomingValue(i);
+          RHS = CmpRHS->DoPHITranslation(BB, PredBB);
+        } else {
+          LHS = CmpLHS->DoPHITranslation(BB, PredBB);
+          RHS = PN->getIncomingValue(i);
+        }
         Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
         if (!Res) {
           if (!isa<Constant>(RHS))
             continue;

+          // getPredicateOnEdge call will make no sense if LHS is defined in BB.
+          auto LHSInst = dyn_cast<Instruction>(LHS);
+          if (LHSInst && LHSInst->getParent() == BB)
+            continue;
+
           LazyValueInfo::Tristate
             ResT = LVI->getPredicateOnEdge(Pred, LHS,
                                            cast<Constant>(RHS), PredBB, BB,
@@ -775,6 +806,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(

     if (!isa<Instruction>(CmpLHS) ||
         cast<Instruction>(CmpLHS)->getParent() != BB) {
+      if (DDT->pending())
+        LVI->disableDT();
+      else
+        LVI->enableDT();
       for (BasicBlock *P : predecessors(BB)) {
         // If the value is known by LazyValueInfo to be a constant in a
         // predecessor, use that information to try to thread this block.
@@ -803,6 +838,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
         match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
       if (!isa<Instruction>(AddLHS) ||
           cast<Instruction>(AddLHS)->getParent() != BB) {
+        if (DDT->pending())
+          LVI->disableDT();
+        else
+          LVI->enableDT();
         for (BasicBlock *P : predecessors(BB)) {
           // If the value is known by LazyValueInfo to be a ConstantRange in
           // a predecessor, use that information to try to thread this
@@ -884,6 +923,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
   }

   // If all else fails, see if LVI can figure out a constant value for us.
+  if (DDT->pending())
+    LVI->disableDT();
+  else
+    LVI->enableDT();
   Constant *CI = LVI->getConstant(V, BB, CxtI);
   if (Constant *KC = getKnownConstant(CI, Preference)) {
     for (BasicBlock *Pred : predecessors(BB))
@@ -903,10 +946,10 @@ static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
   unsigned MinSucc = 0;
   BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
   // Compute the successor with the minimum number of predecessors.
-  unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+  unsigned MinNumPreds = pred_size(TestBB);
   for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
     TestBB = BBTerm->getSuccessor(i);
-    unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+    unsigned NumPreds = pred_size(TestBB);
     if (NumPreds < MinNumPreds) {
       MinSucc = i;
       MinNumPreds = NumPreds;
@@ -931,8 +974,8 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) {
 bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
   // If the block is trivially dead, just return and let the caller nuke it.
   // This simplifies other transformations.
-  if (pred_empty(BB) &&
-      BB != &BB->getParent()->getEntryBlock())
+  if (DDT->pendingDeletedBB(BB) ||
+      (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()))
     return false;

   // If this block has a single predecessor, and if that pred has a single
@@ -948,7 +991,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
       LoopHeaders.insert(BB);

     LVI->eraseBlock(SinglePred);
-    MergeBasicBlockIntoOnlyPred(BB);
+    MergeBasicBlockIntoOnlyPred(BB, nullptr, DDT);

     // Now that BB is merged into SinglePred (i.e. SinglePred Code followed by
     // BB code within one basic block `BB`), we need to invalidate the LVI
@@ -977,9 +1020,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {

     // Invalidate LVI information for BB if the LVI is not provably true for
     // all of BB.
-    if (any_of(*BB, [](Instruction &I) {
-          return !isGuaranteedToTransferExecutionToSuccessor(&I);
-        }))
+    if (!isGuaranteedToTransferExecutionToSuccessor(BB))
       LVI->eraseBlock(BB);
     return true;
   }
@@ -1031,18 +1072,23 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
   // successors to branch to.  Let GetBestDestForJumpOnUndef decide.
if (isa<UndefValue>(Condition)) { unsigned BestSucc = GetBestDestForJumpOnUndef(BB); + std::vector<DominatorTree::UpdateType> Updates; // Fold the branch/switch. TerminatorInst *BBTerm = BB->getTerminator(); + Updates.reserve(BBTerm->getNumSuccessors()); for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) { if (i == BestSucc) continue; - BBTerm->getSuccessor(i)->removePredecessor(BB, true); + BasicBlock *Succ = BBTerm->getSuccessor(i); + Succ->removePredecessor(BB, true); + Updates.push_back({DominatorTree::Delete, BB, Succ}); } - DEBUG(dbgs() << " In block '" << BB->getName() - << "' folding undef terminator: " << *BBTerm << '\n'); + LLVM_DEBUG(dbgs() << " In block '" << BB->getName() + << "' folding undef terminator: " << *BBTerm << '\n'); BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm); BBTerm->eraseFromParent(); + DDT->applyUpdates(Updates); return true; } @@ -1050,10 +1096,11 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // terminator to an unconditional branch. This can occur due to threading in // other blocks. if (getKnownConstant(Condition, Preference)) { - DEBUG(dbgs() << " In block '" << BB->getName() - << "' folding terminator: " << *BB->getTerminator() << '\n'); + LLVM_DEBUG(dbgs() << " In block '" << BB->getName() + << "' folding terminator: " << *BB->getTerminator() + << '\n'); ++NumFolds; - ConstantFoldTerminator(BB, true); + ConstantFoldTerminator(BB, true, nullptr, DDT); return true; } @@ -1080,13 +1127,18 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // threading is concerned. assert(CondBr->isConditional() && "Threading on unconditional terminator"); + if (DDT->pending()) + LVI->disableDT(); + else + LVI->enableDT(); LazyValueInfo::Tristate Ret = LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0), CondConst, CondBr); if (Ret != LazyValueInfo::Unknown) { unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0; unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1; - CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true); + BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove); + ToRemoveSucc->removePredecessor(BB, true); BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr); CondBr->eraseFromParent(); if (CondCmp->use_empty()) @@ -1104,6 +1156,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { ConstantInt::getFalse(CondCmp->getType()); ReplaceFoldableUses(CondCmp, CI); } + DDT->deleteEdge(BB, ToRemoveSucc); return true; } @@ -1125,8 +1178,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // TODO: There are other places where load PRE would be profitable, such as // more complex comparisons. - if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue)) - if (SimplifyPartiallyRedundantLoad(LI)) + if (LoadInst *LoadI = dyn_cast<LoadInst>(SimplifyValue)) + if (SimplifyPartiallyRedundantLoad(LoadI)) return true; // Before threading, try to propagate profile data backwards: @@ -1182,9 +1235,12 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) { Optional<bool> Implication = isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue); if (Implication) { - BI->getSuccessor(*Implication ? 1 : 0)->removePredecessor(BB); - BranchInst::Create(BI->getSuccessor(*Implication ? 0 : 1), BI); + BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1); + BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 
1 : 0); + RemoveSucc->removePredecessor(BB); + BranchInst::Create(KeepSucc, BI); BI->eraseFromParent(); + DDT->deleteEdge(BB, RemoveSucc); return true; } CurrentBB = CurrentPred; @@ -1202,17 +1258,17 @@ static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) { return false; } -/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant -/// load instruction, eliminate it by replacing it with a PHI node. This is an -/// important optimization that encourages jump threading, and needs to be run -/// interlaced with other jump threading tasks. -bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { +/// SimplifyPartiallyRedundantLoad - If LoadI is an obviously partially +/// redundant load instruction, eliminate it by replacing it with a PHI node. +/// This is an important optimization that encourages jump threading, and needs +/// to be run interlaced with other jump threading tasks. +bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { // Don't hack volatile and ordered loads. - if (!LI->isUnordered()) return false; + if (!LoadI->isUnordered()) return false; // If the load is defined in a block with exactly one predecessor, it can't be // partially redundant. - BasicBlock *LoadBB = LI->getParent(); + BasicBlock *LoadBB = LoadI->getParent(); if (LoadBB->getSinglePredecessor()) return false; @@ -1222,7 +1278,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (LoadBB->isEHPad()) return false; - Value *LoadedPtr = LI->getOperand(0); + Value *LoadedPtr = LoadI->getOperand(0); // If the loaded operand is defined in the LoadBB and its not a phi, // it can't be available in predecessors. @@ -1231,26 +1287,27 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan a few instructions up from the load, to see if it is obviously live at // the entry to its block. - BasicBlock::iterator BBIt(LI); + BasicBlock::iterator BBIt(LoadI); bool IsLoadCSE; if (Value *AvailableVal = FindAvailableLoadedValue( - LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) { + LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) { // If the value of the load is locally available within the block, just use // it. This frequently occurs for reg2mem'd allocas. if (IsLoadCSE) { - LoadInst *NLI = cast<LoadInst>(AvailableVal); - combineMetadataForCSE(NLI, LI); + LoadInst *NLoadI = cast<LoadInst>(AvailableVal); + combineMetadataForCSE(NLoadI, LoadI); }; // If the returned value is the load itself, replace with an undef. This can // only happen in dead loops. - if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType()); - if (AvailableVal->getType() != LI->getType()) - AvailableVal = - CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI); - LI->replaceAllUsesWith(AvailableVal); - LI->eraseFromParent(); + if (AvailableVal == LoadI) + AvailableVal = UndefValue::get(LoadI->getType()); + if (AvailableVal->getType() != LoadI->getType()) + AvailableVal = CastInst::CreateBitOrPointerCast( + AvailableVal, LoadI->getType(), "", LoadI); + LoadI->replaceAllUsesWith(AvailableVal); + LoadI->eraseFromParent(); return true; } @@ -1263,7 +1320,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // If all of the loads and stores that feed the value have the same AA tags, // then we can propagate them onto any newly inserted loads. 
AAMDNodes AATags; - LI->getAAMetadata(AATags); + LoadI->getAAMetadata(AATags); SmallPtrSet<BasicBlock*, 8> PredsScanned; @@ -1285,16 +1342,17 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { Value *PredAvailable = nullptr; // NOTE: We don't CSE load that is volatile or anything stronger than // unordered, that should have been checked when we entered the function. - assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads"); + assert(LoadI->isUnordered() && + "Attempting to CSE volatile or atomic loads"); // If this is a load on a phi pointer, phi-translate it and search // for available load/store to the pointer in predecessors. Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB); PredAvailable = FindAvailablePtrLoadStore( - Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan, - AA, &IsLoadCSE, &NumScanedInst); + Ptr, LoadI->getType(), LoadI->isAtomic(), PredBB, BBIt, + DefMaxInstsToScan, AA, &IsLoadCSE, &NumScanedInst); // If PredBB has a single predecessor, continue scanning through the - // single precessor. + // single predecessor. BasicBlock *SinglePredBB = PredBB; while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() && NumScanedInst < DefMaxInstsToScan) { @@ -1302,7 +1360,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (SinglePredBB) { BBIt = SinglePredBB->end(); PredAvailable = FindAvailablePtrLoadStore( - Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt, + Ptr, LoadI->getType(), LoadI->isAtomic(), SinglePredBB, BBIt, (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE, &NumScanedInst); } @@ -1334,15 +1392,15 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // If the value is unavailable in one of predecessors, we will end up // inserting a new instruction into them. It is only valid if all the - // instructions before LI are guaranteed to pass execution to its successor, - // or if LI is safe to speculate. + // instructions before LoadI are guaranteed to pass execution to its + // successor, or if LoadI is safe to speculate. // TODO: If this logic becomes more complex, and we will perform PRE insertion // farther than to a predecessor, we need to reuse the code from GVN's PRE. // It requires domination tree analysis, so for this simple case it is an // overkill. 
if (PredsScanned.size() != AvailablePreds.size() && - !isSafeToSpeculativelyExecute(LI)) - for (auto I = LoadBB->begin(); &*I != LI; ++I) + !isSafeToSpeculativelyExecute(LoadI)) + for (auto I = LoadBB->begin(); &*I != LoadI; ++I) if (!isGuaranteedToTransferExecutionToSuccessor(&*I)) return false; @@ -1381,11 +1439,12 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (UnavailablePred) { assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 && "Can't handle critical edge here!"); - LoadInst *NewVal = new LoadInst( - LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred), - LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(), - LI->getSyncScopeID(), UnavailablePred->getTerminator()); - NewVal->setDebugLoc(LI->getDebugLoc()); + LoadInst *NewVal = + new LoadInst(LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred), + LoadI->getName() + ".pr", false, LoadI->getAlignment(), + LoadI->getOrdering(), LoadI->getSyncScopeID(), + UnavailablePred->getTerminator()); + NewVal->setDebugLoc(LoadI->getDebugLoc()); if (AATags) NewVal->setAAMetadata(AATags); @@ -1398,10 +1457,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Create a PHI node at the start of the block for the PRE'd load value. pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB); - PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "", + PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), "", &LoadBB->front()); - PN->takeName(LI); - PN->setDebugLoc(LI->getDebugLoc()); + PN->takeName(LoadI); + PN->setDebugLoc(LoadI->getDebugLoc()); // Insert new entries into the PHI for each predecessor. A single block may // have multiple entries here. @@ -1419,19 +1478,19 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // AvailablePreds vector as we go so that all of the PHI entries for this // predecessor use the same bitcast. Value *&PredV = I->second; - if (PredV->getType() != LI->getType()) - PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "", + if (PredV->getType() != LoadI->getType()) + PredV = CastInst::CreateBitOrPointerCast(PredV, LoadI->getType(), "", P->getTerminator()); PN->addIncoming(PredV, I->first); } - for (LoadInst *PredLI : CSELoads) { - combineMetadataForCSE(PredLI, LI); + for (LoadInst *PredLoadI : CSELoads) { + combineMetadataForCSE(PredLoadI, LoadI); } - LI->replaceAllUsesWith(PN); - LI->eraseFromParent(); + LoadI->replaceAllUsesWith(PN); + LoadI->eraseFromParent(); return true; } @@ -1516,12 +1575,12 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, assert(!PredValues.empty() && "ComputeValueKnownInPredecessors returned true with no values"); - DEBUG(dbgs() << "IN BB: " << *BB; - for (const auto &PredValue : PredValues) { - dbgs() << " BB '" << BB->getName() << "': FOUND condition = " - << *PredValue.first - << " for pred '" << PredValue.second->getName() << "'.\n"; - }); + LLVM_DEBUG(dbgs() << "IN BB: " << *BB; + for (const auto &PredValue : PredValues) { + dbgs() << " BB '" << BB->getName() + << "': FOUND condition = " << *PredValue.first + << " for pred '" << PredValue.second->getName() << "'.\n"; + }); // Decide what we want to thread through. Convert our list of known values to // a list of known destinations for each pred. This also discards duplicate @@ -1591,20 +1650,24 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, // not thread. 
By doing so, we do not need to duplicate the current block and // also miss potential opportunities in case we dont/cant duplicate. if (OnlyDest && OnlyDest != MultipleDestSentinel) { - if (PredWithKnownDest == - (size_t)std::distance(pred_begin(BB), pred_end(BB))) { + if (PredWithKnownDest == (size_t)pred_size(BB)) { bool SeenFirstBranchToOnlyDest = false; + std::vector <DominatorTree::UpdateType> Updates; + Updates.reserve(BB->getTerminator()->getNumSuccessors() - 1); for (BasicBlock *SuccBB : successors(BB)) { - if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) + if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) { SeenFirstBranchToOnlyDest = true; // Don't modify the first branch. - else + } else { SuccBB->removePredecessor(BB, true); // This is unreachable successor. + Updates.push_back({DominatorTree::Delete, BB, SuccBB}); + } } // Finally update the terminator. TerminatorInst *Term = BB->getTerminator(); BranchInst::Create(OnlyDest, Term); Term->eraseFromParent(); + DDT->applyUpdates(Updates); // If the condition is now dead due to the removal of the old terminator, // erase it. @@ -1839,15 +1902,15 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, BasicBlock *SuccBB) { // If threading to the same block as we come from, we would infinite loop. if (SuccBB == BB) { - DEBUG(dbgs() << " Not threading across BB '" << BB->getName() - << "' - would thread to self!\n"); + LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName() + << "' - would thread to self!\n"); return false; } // If threading this would thread across a loop header, don't thread the edge. // See the comments above FindLoopHeaders for justifications and caveats. if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) { - DEBUG({ + LLVM_DEBUG({ bool BBIsHeader = LoopHeaders.count(BB); bool SuccIsHeader = LoopHeaders.count(SuccBB); dbgs() << " Not threading across " @@ -1861,8 +1924,8 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold); if (JumpThreadCost > BBDupThreshold) { - DEBUG(dbgs() << " Not threading BB '" << BB->getName() - << "' - Cost is too high: " << JumpThreadCost << "\n"); + LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName() + << "' - Cost is too high: " << JumpThreadCost << "\n"); return false; } @@ -1871,17 +1934,21 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, if (PredBBs.size() == 1) PredBB = PredBBs[0]; else { - DEBUG(dbgs() << " Factoring out " << PredBBs.size() - << " common predecessors.\n"); + LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size() + << " common predecessors.\n"); PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); } // And finally, do it! - DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '" - << SuccBB->getName() << "' with cost: " << JumpThreadCost - << ", across block:\n " - << *BB << "\n"); - + LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() + << "' to '" << SuccBB->getName() + << "' with cost: " << JumpThreadCost + << ", across block:\n " << *BB << "\n"); + + if (DDT->pending()) + LVI->disableDT(); + else + LVI->enableDT(); LVI->threadEdge(PredBB, BB, SuccBB); // We are going to have to map operands from the original BB block to the new @@ -1931,15 +1998,32 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // PHI nodes for NewBB now. AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping); + // Update the terminator of PredBB to jump to NewBB instead of BB. 
This + // eliminates predecessors from BB, which requires us to simplify any PHI + // nodes in BB. + TerminatorInst *PredTerm = PredBB->getTerminator(); + for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i) + if (PredTerm->getSuccessor(i) == BB) { + BB->removePredecessor(PredBB, true); + PredTerm->setSuccessor(i, NewBB); + } + + // Enqueue required DT updates. + DDT->applyUpdates({{DominatorTree::Insert, NewBB, SuccBB}, + {DominatorTree::Insert, PredBB, NewBB}, + {DominatorTree::Delete, PredBB, BB}}); + // If there were values defined in BB that are used outside the block, then we // now have to update all uses of the value to use either the original value, // the cloned value, or some PHI derived value. This can require arbitrary // PHI insertion, of which we are prepared to do, clean these up now. SSAUpdater SSAUpdate; SmallVector<Use*, 16> UsesToRename; + for (Instruction &I : *BB) { - // Scan all uses of this instruction to see if it is used outside of its - // block, and if so, record them in UsesToRename. + // Scan all uses of this instruction to see if their uses are no longer + // dominated by the previous def and if so, record them in UsesToRename. + // Also, skip phi operands from PredBB - we'll remove them anyway. for (Use &U : I.uses()) { Instruction *User = cast<Instruction>(U.getUser()); if (PHINode *UserPN = dyn_cast<PHINode>(User)) { @@ -1954,8 +2038,7 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // If there are no uses outside the block, we're done with this instruction. if (UsesToRename.empty()) continue; - - DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); + LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); // We found a use of I outside of BB. Rename all uses of I that are outside // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks @@ -1966,19 +2049,9 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "\n"); } - // Ok, NewBB is good to go. Update the terminator of PredBB to jump to - // NewBB instead of BB. This eliminates predecessors from BB, which requires - // us to simplify any PHI nodes in BB. - TerminatorInst *PredTerm = PredBB->getTerminator(); - for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i) - if (PredTerm->getSuccessor(i) == BB) { - BB->removePredecessor(PredBB, true); - PredTerm->setSuccessor(i, NewBB); - } - // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This // frequently happens because of phi translation. @@ -1998,20 +2071,42 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, const char *Suffix) { + SmallVector<BasicBlock *, 2> NewBBs; + // Collect the frequencies of all predecessors of BB, which will be used to - // update the edge weight on BB->SuccBB. - BlockFrequency PredBBFreq(0); + // update the edge weight of the result of splitting predecessors. + DenseMap<BasicBlock *, BlockFrequency> FreqMap; if (HasProfileData) for (auto Pred : Preds) - PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB); + FreqMap.insert(std::make_pair( + Pred, BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB))); + + // In the case when BB is a LandingPad block we create 2 new predecessors + // instead of just one. 
+ if (BB->isLandingPad()) { + std::string NewName = std::string(Suffix) + ".split-lp"; + SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs); + } else { + NewBBs.push_back(SplitBlockPredecessors(BB, Preds, Suffix)); + } - BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix); + std::vector<DominatorTree::UpdateType> Updates; + Updates.reserve((2 * Preds.size()) + NewBBs.size()); + for (auto NewBB : NewBBs) { + BlockFrequency NewBBFreq(0); + Updates.push_back({DominatorTree::Insert, NewBB, BB}); + for (auto Pred : predecessors(NewBB)) { + Updates.push_back({DominatorTree::Delete, Pred, BB}); + Updates.push_back({DominatorTree::Insert, Pred, NewBB}); + if (HasProfileData) // Update frequencies between Pred -> NewBB. + NewBBFreq += FreqMap.lookup(Pred); + } + if (HasProfileData) // Apply the summed frequency to NewBB. + BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); + } - // Set the block frequency of the newly created PredBB, which is the sum of - // frequencies of Preds. - if (HasProfileData) - BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency()); - return PredBB; + DDT->applyUpdates(Updates); + return NewBBs[0]; } bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) { @@ -2140,42 +2235,49 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( // cause us to transform this into an irreducible loop, don't do this. // See the comments above FindLoopHeaders for justifications and caveats. if (LoopHeaders.count(BB)) { - DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName() - << "' into predecessor block '" << PredBBs[0]->getName() - << "' - it might create an irreducible loop!\n"); + LLVM_DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName() + << "' into predecessor block '" << PredBBs[0]->getName() + << "' - it might create an irreducible loop!\n"); return false; } unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold); if (DuplicationCost > BBDupThreshold) { - DEBUG(dbgs() << " Not duplicating BB '" << BB->getName() - << "' - Cost is too high: " << DuplicationCost << "\n"); + LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName() + << "' - Cost is too high: " << DuplicationCost << "\n"); return false; } // And finally, do it! Start by factoring the predecessors if needed. + std::vector<DominatorTree::UpdateType> Updates; BasicBlock *PredBB; if (PredBBs.size() == 1) PredBB = PredBBs[0]; else { - DEBUG(dbgs() << " Factoring out " << PredBBs.size() - << " common predecessors.\n"); + LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size() + << " common predecessors.\n"); PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); } + Updates.push_back({DominatorTree::Delete, PredBB, BB}); // Okay, we decided to do this! Clone all the instructions in BB onto the end // of PredBB. - DEBUG(dbgs() << " Duplicating block '" << BB->getName() << "' into end of '" - << PredBB->getName() << "' to eliminate branch on phi. Cost: " - << DuplicationCost << " block is:" << *BB << "\n"); + LLVM_DEBUG(dbgs() << " Duplicating block '" << BB->getName() + << "' into end of '" << PredBB->getName() + << "' to eliminate branch on phi. Cost: " + << DuplicationCost << " block is:" << *BB << "\n"); // Unless PredBB ends with an unconditional branch, split the edge so that we // can just clone the bits from BB into the end of the new PredBB. 
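The profile bookkeeping in the SplitBlockPreds rewrite above boils down to: each new predecessor block carries the frequency of every edge rerouted through it, i.e. freq(NewBB) = sum over its predecessors P of freq(P) * prob(P -> BB). Restated as a fragment (BFI, BPI, Preds, BB, and NewBB as in the function above; not compilable on its own):

    // Pin the summed frequency of the redirected edges on the new block.
    BlockFrequency NewBBFreq(0);
    for (BasicBlock *Pred : Preds)
      NewBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB);
    BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());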
BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator()); if (!OldPredBranch || !OldPredBranch->isUnconditional()) { - PredBB = SplitEdge(PredBB, BB); + BasicBlock *OldPredBB = PredBB; + PredBB = SplitEdge(OldPredBB, BB); + Updates.push_back({DominatorTree::Insert, OldPredBB, PredBB}); + Updates.push_back({DominatorTree::Insert, PredBB, BB}); + Updates.push_back({DominatorTree::Delete, OldPredBB, BB}); OldPredBranch = cast<BranchInst>(PredBB->getTerminator()); } @@ -2217,6 +2319,10 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( // Otherwise, insert the new instruction into the block. New->setName(BI->getName()); PredBB->getInstList().insert(OldPredBranch->getIterator(), New); + // Update Dominance from simplified New instruction operands. + for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) + if (BasicBlock *SuccBB = dyn_cast<BasicBlock>(New->getOperand(i))) + Updates.push_back({DominatorTree::Insert, PredBB, SuccBB}); } } @@ -2252,7 +2358,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( if (UsesToRename.empty()) continue; - DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); + LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); // We found a use of I outside of BB. Rename all uses of I that are outside // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks @@ -2263,7 +2369,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "\n"); } // PredBB no longer jumps to BB, remove entries in the PHI node for the edge @@ -2272,6 +2378,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( // Remove the unconditional branch at the end of the PredBB block. OldPredBranch->eraseFromParent(); + DDT->applyUpdates(Updates); ++NumDupes; return true; @@ -2314,6 +2421,10 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { // Now check if one of the select values would allow us to constant fold the // terminator in BB. We don't do the transform if both sides fold, those // cases will be threaded in any case. + if (DDT->pending()) + LVI->disableDT(); + else + LVI->enableDT(); LazyValueInfo::Tristate LHSFolds = LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1), CondRHS, Pred, BB, CondCmp); @@ -2344,6 +2455,8 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { // The select is now dead. SI->eraseFromParent(); + DDT->applyUpdates({{DominatorTree::Insert, NewBB, BB}, + {DominatorTree::Insert, Pred, NewBB}}); // Update any other PHI nodes in BB. for (BasicBlock::iterator BI = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(BI); ++BI) @@ -2409,7 +2522,7 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { break; } } else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) { - // Look for a Select in BB that uses PN as condtion. + // Look for a Select in BB that uses PN as condition. if (isUnfoldCandidate(SelectI, U.get())) { SI = SelectI; break; @@ -2422,11 +2535,25 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { // Expand the select. 
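The expansion that follows turns the select into explicit control flow so the branch on the phi becomes threadable. Roughly, with block and value names illustrative rather than the pass's actual output:

    // before:  %s = select i1 %c, i32 %t, i32 %f    ; in BB
    // after:   BB:       br i1 %c, label %NewBB, label %SplitBB
    //          NewBB:    br label %SplitBB
    //          SplitBB:  %s = phi i32 [ %t, %NewBB ], [ %f, %BB ]
    // Each arm of the former select now arrives on its own CFG edge.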
TerminatorInst *Term = SplitBlockAndInsertIfThen(SI->getCondition(), SI, false); + BasicBlock *SplitBB = SI->getParent(); + BasicBlock *NewBB = Term->getParent(); PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); NewPN->addIncoming(SI->getFalseValue(), BB); SI->replaceAllUsesWith(NewPN); SI->eraseFromParent(); + // NewBB and SplitBB are newly created blocks which require insertion. + std::vector<DominatorTree::UpdateType> Updates; + Updates.reserve((2 * SplitBB->getTerminator()->getNumSuccessors()) + 3); + Updates.push_back({DominatorTree::Insert, BB, SplitBB}); + Updates.push_back({DominatorTree::Insert, BB, NewBB}); + Updates.push_back({DominatorTree::Insert, NewBB, SplitBB}); + // BB's successors were moved to SplitBB, update DDT accordingly. + for (auto *Succ : successors(SplitBB)) { + Updates.push_back({DominatorTree::Delete, BB, Succ}); + Updates.push_back({DominatorTree::Insert, SplitBB, Succ}); + } + DDT->applyUpdates(Updates); return true; } return false; @@ -2513,8 +2640,8 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard, if (!TrueDestIsSafe && !FalseDestIsSafe) return false; - BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest; - BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest; + BasicBlock *PredUnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest; + BasicBlock *PredGuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest; ValueToValueMapTy UnguardedMapping, GuardedMapping; Instruction *AfterGuard = Guard->getNextNode(); @@ -2523,18 +2650,29 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard, return false; // Duplicate all instructions before the guard and the guard itself to the // branch where implication is not proved. - GuardedBlock = DuplicateInstructionsInSplitBetween( - BB, GuardedBlock, AfterGuard, GuardedMapping); + BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween( + BB, PredGuardedBlock, AfterGuard, GuardedMapping); assert(GuardedBlock && "Could not create the guarded block?"); // Duplicate all instructions before the guard in the unguarded branch. // Since we have successfully duplicated the guarded block and this block // has fewer instructions, we expect it to succeed. - UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock, - Guard, UnguardedMapping); + BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween( + BB, PredUnguardedBlock, Guard, UnguardedMapping); assert(UnguardedBlock && "Could not create the unguarded block?"); - DEBUG(dbgs() << "Moved guard " << *Guard << " to block " - << GuardedBlock->getName() << "\n"); - + LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block " + << GuardedBlock->getName() << "\n"); + // DuplicateInstructionsInSplitBetween inserts a new block "BB.split" between + // PredBB and BB. We need to perform two inserts and one delete for each of + // the above calls to update Dominators. + DDT->applyUpdates( + {// Guarded block split. + {DominatorTree::Delete, PredGuardedBlock, BB}, + {DominatorTree::Insert, PredGuardedBlock, GuardedBlock}, + {DominatorTree::Insert, GuardedBlock, BB}, + // Unguarded block split. + {DominatorTree::Delete, PredUnguardedBlock, BB}, + {DominatorTree::Insert, PredUnguardedBlock, UnguardedBlock}, + {DominatorTree::Insert, UnguardedBlock, BB}}); // Some instructions before the guard may still have uses. For them, we need // to create Phi nodes merging their copies in both guarded and unguarded // branches. 
Those instructions that have no uses can be just removed. diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index 946474fef062..ff66632f0391 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -47,6 +47,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -64,7 +65,6 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <algorithm> @@ -170,7 +170,8 @@ struct LegacyLICMPass : public LoopPass { /// loop preheaders be inserted into the CFG... /// void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); if (EnableMSSALoopDependency) AU.addRequired<MemorySSAWrapperPass>(); @@ -220,7 +221,10 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, return PreservedAnalyses::all(); auto PA = getLoopPassPreservedAnalyses(); - PA.preserveSet<CFGAnalyses>(); + + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + return PA; } @@ -392,7 +396,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // If the instruction is dead, we would try to sink it because it isn't // used in the loop, instead, just delete it. if (isInstructionTriviallyDead(&I, TLI)) { - DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n'); + LLVM_DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n'); + salvageDebugInfo(I); ++II; CurAST->deleteValue(&I); I.eraseFromParent(); @@ -445,101 +450,78 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, BasicBlock *BB = DTN->getBlock(); // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). - if (!inSubLoop(BB, CurLoop, LI)) - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { - Instruction &I = *II++; - // Try constant folding this instruction. If all the operands are - // constants, it is technically hoistable, but it would be better to - // just fold it. - if (Constant *C = ConstantFoldInstruction( - &I, I.getModule()->getDataLayout(), TLI)) { - DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); - CurAST->copyValue(&I, C); - I.replaceAllUsesWith(C); - if (isInstructionTriviallyDead(&I, TLI)) { - CurAST->deleteValue(&I); - I.eraseFromParent(); - } - Changed = true; - continue; - } + if (inSubLoop(BB, CurLoop, LI)) + continue; - // Attempt to remove floating point division out of the loop by - // converting it to a reciprocal multiplication. 
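In source terms, the reciprocal rewrite is the familiar x/d to x*(1/d) strength reduction, profitable because the loop-invariant reciprocal can be hoisted; it is only legal under reciprocal (arcp) fast-math. A scalar analogue, not taken from the pass:

    // Assumes reciprocal fast-math semantics; D must be loop-invariant.
    void scale(double *Out, const double *In, double D, int N) {
      double R = 1.0 / D;      // one divide, hoisted out of the loop
      for (int I = 0; I < N; ++I)
        Out[I] = In[I] * R;    // a multiply per iteration instead of a divide
    }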
- if (I.getOpcode() == Instruction::FDiv && - CurLoop->isLoopInvariant(I.getOperand(1)) && - I.hasAllowReciprocal()) { - auto Divisor = I.getOperand(1); - auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0); - auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor); - ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags()); - ReciprocalDivisor->insertBefore(&I); - - auto Product = - BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor); - Product->setFastMathFlags(I.getFastMathFlags()); - Product->insertAfter(&I); - I.replaceAllUsesWith(Product); + // Keep track of whether the prefix of instructions visited so far are such + // that the next instruction visited is guaranteed to execute if the loop + // is entered. + bool IsMustExecute = CurLoop->getHeader() == BB; + + for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { + Instruction &I = *II++; + // Try constant folding this instruction. If all the operands are + // constants, it is technically hoistable, but it would be better to + // just fold it. + if (Constant *C = ConstantFoldInstruction( + &I, I.getModule()->getDataLayout(), TLI)) { + LLVM_DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C + << '\n'); + CurAST->copyValue(&I, C); + I.replaceAllUsesWith(C); + if (isInstructionTriviallyDead(&I, TLI)) { + CurAST->deleteValue(&I); I.eraseFromParent(); - - hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE); - Changed = true; - continue; } + Changed = true; + continue; + } + + // Try hoisting the instruction out to the preheader. We can only do + // this if all of the operands of the instruction are loop invariant and + // if it is safe to hoist the instruction. + // + if (CurLoop->hasLoopInvariantOperands(&I) && + canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) && + (IsMustExecute || + isSafeToExecuteUnconditionally( + I, DT, CurLoop, SafetyInfo, ORE, + CurLoop->getLoopPreheader()->getTerminator()))) { + Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE); + continue; + } + + // Attempt to remove floating point division out of the loop by + // converting it to a reciprocal multiplication. + if (I.getOpcode() == Instruction::FDiv && + CurLoop->isLoopInvariant(I.getOperand(1)) && + I.hasAllowReciprocal()) { + auto Divisor = I.getOperand(1); + auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0); + auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor); + ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags()); + ReciprocalDivisor->insertBefore(&I); + + auto Product = + BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor); + Product->setFastMathFlags(I.getFastMathFlags()); + Product->insertAfter(&I); + I.replaceAllUsesWith(Product); + I.eraseFromParent(); - // Try hoisting the instruction out to the preheader. We can only do - // this if all of the operands of the instruction are loop invariant and - // if it is safe to hoist the instruction. - // - if (CurLoop->hasLoopInvariantOperands(&I) && - canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) && - isSafeToExecuteUnconditionally( - I, DT, CurLoop, SafetyInfo, ORE, - CurLoop->getLoopPreheader()->getTerminator())) - Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE); + hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE); + Changed = true; + continue; } + + if (IsMustExecute) + IsMustExecute = isGuaranteedToTransferExecutionToSuccessor(&I); + } } return Changed; } -/// Computes loop safety information, checks loop body & header -/// for the possibility of may throw exception. 
-/// -void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) { - assert(CurLoop != nullptr && "CurLoop cant be null"); - BasicBlock *Header = CurLoop->getHeader(); - // Setting default safety values. - SafetyInfo->MayThrow = false; - SafetyInfo->HeaderMayThrow = false; - // Iterate over header and compute safety info. - for (BasicBlock::iterator I = Header->begin(), E = Header->end(); - (I != E) && !SafetyInfo->HeaderMayThrow; ++I) - SafetyInfo->HeaderMayThrow |= - !isGuaranteedToTransferExecutionToSuccessor(&*I); - - SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow; - // Iterate over loop instructions and compute safety info. - // Skip header as it has been computed and stored in HeaderMayThrow. - // The first block in loopinfo.Blocks is guaranteed to be the header. - assert(Header == *CurLoop->getBlocks().begin() && - "First block must be header"); - for (Loop::block_iterator BB = std::next(CurLoop->block_begin()), - BBE = CurLoop->block_end(); - (BB != BBE) && !SafetyInfo->MayThrow; ++BB) - for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); - (I != E) && !SafetyInfo->MayThrow; ++I) - SafetyInfo->MayThrow |= !isGuaranteedToTransferExecutionToSuccessor(&*I); - - // Compute funclet colors if we might sink/hoist in a function with a funclet - // personality routine. - Function *Fn = CurLoop->getHeader()->getParent(); - if (Fn->hasPersonalityFn()) - if (Constant *PersonalityFn = Fn->getPersonalityFn()) - if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))) - SafetyInfo->BlockColors = colorEHFunclets(*Fn); -} - // Return true if LI is invariant within scope of the loop. LI is invariant if // CurLoop is dominated by an invariant.start representing the same memory // location and size as the memory location LI loads from, and also the @@ -708,7 +690,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, /// This is true when all incoming values are that instruction. /// This pattern occurs most often with LCSSA PHI nodes. /// -static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) { +static bool isTriviallyReplaceablePHI(const PHINode &PN, const Instruction &I) { for (const Value *IncValue : PN.incoming_values()) if (IncValue != &I) return false; @@ -838,12 +820,12 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, return New; } -static Instruction *sinkThroughTriviallyReplacablePHI( +static Instruction *sinkThroughTriviallyReplaceablePHI( PHINode *TPN, Instruction *I, LoopInfo *LI, SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies, const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop) { - assert(isTriviallyReplacablePHI(*TPN, *I) && - "Expect only trivially replacalbe PHI"); + assert(isTriviallyReplaceablePHI(*TPN, *I) && + "Expect only trivially replaceable PHI"); BasicBlock *ExitBlock = TPN->getParent(); Instruction *New; auto It = SunkCopies.find(ExitBlock); @@ -886,7 +868,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block."); // Split predecessors of the loop exit to make instructions in the loop are - // exposed to exit blocks through trivially replacable PHIs while keeping the + // exposed to exit blocks through trivially replaceable PHIs while keeping the // loop in the canonical form where each predecessor of each exit block should // be contained within the loop. 
For example, this will convert the loop below // from @@ -898,7 +880,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, // %v2 = // br %LE, %LB1 // LE: - // %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replacable + // %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replaceable // // to // @@ -909,10 +891,10 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, // %v2 = // br %LE.split2, %LB1 // LE.split: - // %p1 = phi [%v1, %LB1] <-- trivially replacable + // %p1 = phi [%v1, %LB1] <-- trivially replaceable // br %LE // LE.split2: - // %p2 = phi [%v2, %LB2] <-- trivially replacable + // %p2 = phi [%v2, %LB2] <-- trivially replaceable // br %LE // LE: // %p = phi [%p1, %LE.split], [%p2, %LE.split2] @@ -929,8 +911,14 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, // Since we do not allow splitting EH-block with BlockColors in // canSplitPredecessors(), we can simply assign predecessor's color to // the new block. - if (!BlockColors.empty()) - BlockColors[NewPred] = BlockColors[PredBB]; + if (!BlockColors.empty()) { + // Grab a reference to the ColorVector to be inserted before getting the + // reference to the vector we are copying because inserting the new + // element in BlockColors might cause the map to be reallocated. + ColorVector &ColorsForNewBlock = BlockColors[NewPred]; + ColorVector &ColorsForOldBlock = BlockColors[PredBB]; + ColorsForNewBlock = ColorsForOldBlock; + } } PredBBs.remove(PredBB); } @@ -944,7 +932,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, const Loop *CurLoop, LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE, bool FreeInLoop) { - DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); + LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I) << "sinking " << ore::NV("Inst", &I); @@ -987,14 +975,14 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, } VisitedUsers.insert(PN); - if (isTriviallyReplacablePHI(*PN, I)) + if (isTriviallyReplaceablePHI(*PN, I)) continue; if (!canSplitPredecessors(PN, SafetyInfo)) return Changed; // Split predecessors of the PHI so that we can make users trivially - // replacable. + // replaceable. splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo); // Should rebuild the iterators, as they may be invalidated by @@ -1029,9 +1017,9 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, PHINode *PN = cast<PHINode>(User); assert(ExitBlockSet.count(PN->getParent()) && "The LCSSA PHI is not in an exit block!"); - // The PHI must be trivially replacable. - Instruction *New = sinkThroughTriviallyReplacablePHI(PN, &I, LI, SunkCopies, - SafetyInfo, CurLoop); + // The PHI must be trivially replaceable. 
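Here "trivially replaceable" has the narrow meaning established earlier in the file: every incoming value of the LCSSA phi is the instruction being sunk, so the phi can simply be replaced by the sunk copy. Illustratively (names made up):

    //   LE:  %p = phi i32 [ %inst, %LB1 ], [ %inst, %LB2 ]
    // Both incoming values are %inst, so once a copy of %inst is sunk into
    // LE, %p can be RAUW'd with that copy and erased.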
+ Instruction *New = sinkThroughTriviallyReplaceablePHI(PN, &I, LI, SunkCopies, + SafetyInfo, CurLoop); PN->replaceAllUsesWith(New); PN->eraseFromParent(); Changed = true; @@ -1046,8 +1034,8 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) { auto *Preheader = CurLoop->getLoopPreheader(); - DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I - << "\n"); + LLVM_DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I + << "\n"); ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting " << ore::NV("Inst", &I); @@ -1236,7 +1224,7 @@ bool llvm::promoteLoopAccessesToScalars( Value *SomePtr = *PointerMustAliases.begin(); BasicBlock *Preheader = CurLoop->getLoopPreheader(); - // It isn't safe to promote a load/store from the loop if the load/store is + // It is not safe to promote a load/store from the loop if the load/store is // conditional. For example, turning: // // for () { if (c) *P += 1; } @@ -1365,7 +1353,7 @@ bool llvm::promoteLoopAccessesToScalars( // If a store dominates all exit blocks, it is safe to sink. // As explained above, if an exit block was executed, a dominating - // store must have been been executed at least once, so we are not + // store must have been executed at least once, so we are not // introducing stores on paths that did not have them. // Note that this only looks at explicit exit blocks. If we ever // start sinking stores into unwind edges (see above), this will break. @@ -1427,8 +1415,8 @@ bool llvm::promoteLoopAccessesToScalars( return false; // Otherwise, this is safe to promote, lets do it! - DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr - << '\n'); + LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr + << '\n'); ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", LoopUses[0]) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 7f7c6de76450..3b41b5d96c86 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -71,7 +71,7 @@ public: private: bool runOnLoop(Loop *L); - /// \brief Check if the the stride of the accesses is large enough to + /// Check if the stride of the accesses is large enough to /// warrant a prefetch. bool isStrideLargeEnough(const SCEVAddRecExpr *AR); @@ -244,9 +244,9 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { if (ItersAhead > getMaxPrefetchIterationsAhead()) return MadeChange; - DEBUG(dbgs() << "Prefetching " << ItersAhead - << " iterations ahead (loop size: " << LoopSize << ") in " - << L->getHeader()->getParent()->getName() << ": " << *L); + LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead + << " iterations ahead (loop size: " << LoopSize << ") in " + << L->getHeader()->getParent()->getName() << ": " << *L); SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads; for (const auto BB : L->blocks()) { @@ -275,7 +275,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { if (!LSCEVAddRec) continue; - // Check if the the stride of the accesses is large enough to warrant a + // Check if the stride of the accesses is large enough to warrant a // prefetch. if (!isStrideLargeEnough(LSCEVAddRec)) continue; @@ -320,8 +320,8 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { ConstantInt::get(I32, MemI->mayReadFromMemory() ? 
0 : 1), ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); ++NumPrefetches; - DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV - << "\n"); + LLVM_DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV + << "\n"); ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI) << "prefetched memory access"; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 15cd1086f209..d412025d7e94 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -142,14 +142,15 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, // of trouble. BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader || !L->hasDedicatedExits()) { - DEBUG(dbgs() - << "Deletion requires Loop with preheader and dedicated exits.\n"); + LLVM_DEBUG( + dbgs() + << "Deletion requires Loop with preheader and dedicated exits.\n"); return LoopDeletionResult::Unmodified; } // We can't remove loops that contain subloops. If the subloops were dead, // they would already have been removed in earlier executions of this pass. if (L->begin() != L->end()) { - DEBUG(dbgs() << "Loop contains subloops.\n"); + LLVM_DEBUG(dbgs() << "Loop contains subloops.\n"); return LoopDeletionResult::Unmodified; } @@ -157,7 +158,7 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, BasicBlock *ExitBlock = L->getUniqueExitBlock(); if (ExitBlock && isLoopNeverExecuted(L)) { - DEBUG(dbgs() << "Loop is proven to never execute, delete it!"); + LLVM_DEBUG(dbgs() << "Loop is proven to never execute, delete it!"); // Set incoming value to undef for phi nodes in the exit block. for (PHINode &P : ExitBlock->phis()) { std::fill(P.incoming_values().begin(), P.incoming_values().end(), @@ -178,13 +179,13 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, // block will be branched to, or trying to preserve the branching logic in // a loop invariant manner. if (!ExitBlock) { - DEBUG(dbgs() << "Deletion requires single exit block\n"); + LLVM_DEBUG(dbgs() << "Deletion requires single exit block\n"); return LoopDeletionResult::Unmodified; } // Finally, we have to check that the loop really is dead. bool Changed = false; if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) { - DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n"); + LLVM_DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n"); return Changed ? LoopDeletionResult::Modified : LoopDeletionResult::Unmodified; } @@ -193,12 +194,12 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, // They could be infinite, in which case we'd be changing program behavior. const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) { - DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n"); + LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n"); return Changed ? 
LoopDeletionResult::Modified : LoopDeletionResult::Unmodified; } - DEBUG(dbgs() << "Loop is invariant, delete it!"); + LLVM_DEBUG(dbgs() << "Loop is invariant, delete it!"); deleteDeadLoop(L, &DT, &SE, &LI); ++NumDeleted; @@ -209,8 +210,8 @@ PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &Updater) { - DEBUG(dbgs() << "Analyzing Loop for deletion: "); - DEBUG(L.dump()); + LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: "); + LLVM_DEBUG(L.dump()); std::string LoopName = L.getName(); auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI); if (Result == LoopDeletionResult::Unmodified) @@ -255,8 +256,8 @@ bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - DEBUG(dbgs() << "Analyzing Loop for deletion: "); - DEBUG(L->dump()); + LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: "); + LLVM_DEBUG(L->dump()); LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 0d7e3db901cb..06083a4f5086 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -111,7 +111,7 @@ STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { -/// \brief Maintains the set of instructions of the loop for a partition before +/// Maintains the set of instructions of the loop for a partition before /// cloning. After cloning, it hosts the new loop. class InstPartition { using InstructionSet = SmallPtrSet<Instruction *, 8>; @@ -122,20 +122,20 @@ public: Set.insert(I); } - /// \brief Returns whether this partition contains a dependence cycle. + /// Returns whether this partition contains a dependence cycle. bool hasDepCycle() const { return DepCycle; } - /// \brief Adds an instruction to this partition. + /// Adds an instruction to this partition. void add(Instruction *I) { Set.insert(I); } - /// \brief Collection accessors. + /// Collection accessors. InstructionSet::iterator begin() { return Set.begin(); } InstructionSet::iterator end() { return Set.end(); } InstructionSet::const_iterator begin() const { return Set.begin(); } InstructionSet::const_iterator end() const { return Set.end(); } bool empty() const { return Set.empty(); } - /// \brief Moves this partition into \p Other. This partition becomes empty + /// Moves this partition into \p Other. This partition becomes empty /// after this. void moveTo(InstPartition &Other) { Other.Set.insert(Set.begin(), Set.end()); @@ -143,7 +143,7 @@ public: Other.DepCycle |= DepCycle; } - /// \brief Populates the partition with a transitive closure of all the + /// Populates the partition with a transitive closure of all the /// instructions that the seeded instructions dependent on. void populateUsedSet() { // FIXME: We currently don't use control-dependence but simply include all @@ -166,7 +166,7 @@ public: } } - /// \brief Clones the original loop. + /// Clones the original loop. /// /// Updates LoopInfo and DominatorTree using the information that block \p /// LoopDomBB dominates the loop. @@ -179,27 +179,27 @@ public: return ClonedLoop; } - /// \brief The cloned loop. If this partition is mapped to the original loop, + /// The cloned loop. If this partition is mapped to the original loop, /// this is null. 
const Loop *getClonedLoop() const { return ClonedLoop; } - /// \brief Returns the loop where this partition ends up after distribution. + /// Returns the loop where this partition ends up after distribution. /// If this partition is mapped to the original loop then use the block from /// the loop. const Loop *getDistributedLoop() const { return ClonedLoop ? ClonedLoop : OrigLoop; } - /// \brief The VMap that is populated by cloning and then used in + /// The VMap that is populated by cloning and then used in /// remapinstruction to remap the cloned instructions. ValueToValueMapTy &getVMap() { return VMap; } - /// \brief Remaps the cloned instructions using VMap. + /// Remaps the cloned instructions using VMap. void remapInstructions() { remapInstructionsInBlocks(ClonedLoopBlocks, VMap); } - /// \brief Based on the set of instructions selected for this partition, + /// Based on the set of instructions selected for this partition, /// removes the unnecessary ones. void removeUnusedInsts() { SmallVector<Instruction *, 8> Unused; @@ -239,30 +239,30 @@ public: } private: - /// \brief Instructions from OrigLoop selected for this partition. + /// Instructions from OrigLoop selected for this partition. InstructionSet Set; - /// \brief Whether this partition contains a dependence cycle. + /// Whether this partition contains a dependence cycle. bool DepCycle; - /// \brief The original loop. + /// The original loop. Loop *OrigLoop; - /// \brief The cloned loop. If this partition is mapped to the original loop, + /// The cloned loop. If this partition is mapped to the original loop, /// this is null. Loop *ClonedLoop = nullptr; - /// \brief The blocks of ClonedLoop including the preheader. If this + /// The blocks of ClonedLoop including the preheader. If this /// partition is mapped to the original loop, this is empty. SmallVector<BasicBlock *, 8> ClonedLoopBlocks; - /// \brief These gets populated once the set of instructions have been + /// These gets populated once the set of instructions have been /// finalized. If this partition is mapped to the original loop, these are not /// set. ValueToValueMapTy VMap; }; -/// \brief Holds the set of Partitions. It populates them, merges them and then +/// Holds the set of Partitions. It populates them, merges them and then /// clones the loops. class InstPartitionContainer { using InstToPartitionIdT = DenseMap<Instruction *, int>; @@ -271,10 +271,10 @@ public: InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) : L(L), LI(LI), DT(DT) {} - /// \brief Returns the number of partitions. + /// Returns the number of partitions. unsigned getSize() const { return PartitionContainer.size(); } - /// \brief Adds \p Inst into the current partition if that is marked to + /// Adds \p Inst into the current partition if that is marked to /// contain cycles. Otherwise start a new partition for it. void addToCyclicPartition(Instruction *Inst) { // If the current partition is non-cyclic. Start a new one. @@ -284,7 +284,7 @@ public: PartitionContainer.back().add(Inst); } - /// \brief Adds \p Inst into a partition that is not marked to contain + /// Adds \p Inst into a partition that is not marked to contain /// dependence cycles. /// // Initially we isolate memory instructions into as many partitions as @@ -293,7 +293,7 @@ public: PartitionContainer.emplace_back(Inst, L); } - /// \brief Merges adjacent non-cyclic partitions. + /// Merges adjacent non-cyclic partitions. /// /// The idea is that we currently only want to isolate the non-vectorizable /// partition. 
We could later allow more distribution among these partition @@ -303,7 +303,7 @@ public: [](const InstPartition *P) { return !P->hasDepCycle(); }); } - /// \brief If a partition contains only conditional stores, we won't vectorize + /// If a partition contains only conditional stores, we won't vectorize /// it. Try to merge it with a previous cyclic partition. void mergeNonIfConvertible() { mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { @@ -323,14 +323,14 @@ public: }); } - /// \brief Merges the partitions according to various heuristics. + /// Merges the partitions according to various heuristics. void mergeBeforePopulating() { mergeAdjacentNonCyclic(); if (!DistributeNonIfConvertible) mergeNonIfConvertible(); } - /// \brief Merges partitions in order to ensure that no loads are duplicated. + /// Merges partitions in order to ensure that no loads are duplicated. /// /// We can't duplicate loads because that could potentially reorder them. /// LoopAccessAnalysis provides dependency information with the context that @@ -362,9 +362,11 @@ public: std::tie(LoadToPart, NewElt) = LoadToPartition.insert(std::make_pair(Inst, PartI)); if (!NewElt) { - DEBUG(dbgs() << "Merging partitions due to this load in multiple " - << "partitions: " << PartI << ", " - << LoadToPart->second << "\n" << *Inst << "\n"); + LLVM_DEBUG(dbgs() + << "Merging partitions due to this load in multiple " + << "partitions: " << PartI << ", " << LoadToPart->second + << "\n" + << *Inst << "\n"); auto PartJ = I; do { @@ -398,7 +400,7 @@ public: return true; } - /// \brief Sets up the mapping between instructions to partitions. If the + /// Sets up the mapping between instructions to partitions. If the /// instruction is duplicated across multiple partitions, set the entry to -1. void setupPartitionIdOnInstructions() { int PartitionID = 0; @@ -416,14 +418,14 @@ public: } } - /// \brief Populates the partition with everything that the seeding + /// Populates the partition with everything that the seeding /// instructions require. void populateUsedSet() { for (auto &P : PartitionContainer) P.populateUsedSet(); } - /// \brief This performs the main chunk of the work of cloning the loops for + /// This performs the main chunk of the work of cloning the loops for /// the partitions. void cloneLoops() { BasicBlock *OrigPH = L->getLoopPreheader(); @@ -470,13 +472,13 @@ public: Curr->getDistributedLoop()->getExitingBlock()); } - /// \brief Removes the dead instructions from the cloned loops. + /// Removes the dead instructions from the cloned loops. void removeUnusedInsts() { for (auto &Partition : PartitionContainer) Partition.removeUnusedInsts(); } - /// \brief For each memory pointer, it computes the partitionId the pointer is + /// For each memory pointer, it computes the partitionId the pointer is /// used in. /// /// This returns an array of int where the I-th entry corresponds to I-th @@ -543,10 +545,10 @@ public: private: using PartitionContainerT = std::list<InstPartition>; - /// \brief List of partitions. + /// List of partitions. PartitionContainerT PartitionContainer; - /// \brief Mapping from Instruction to partition Id. If the instruction + /// Mapping from Instruction to partition Id. If the instruction /// belongs to multiple partitions the entry contains -1. 
InstToPartitionIdT InstToPartitionId; @@ -554,7 +556,7 @@ private: LoopInfo *LI; DominatorTree *DT; - /// \brief The control structure to merge adjacent partitions if both satisfy + /// The control structure to merge adjacent partitions if both satisfy /// the \p Predicate. template <class UnaryPredicate> void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { @@ -575,7 +577,7 @@ private: } }; -/// \brief For each memory instruction, this class maintains difference of the +/// For each memory instruction, this class maintains difference of the /// number of unsafe dependences that start out from this instruction minus /// those that end here. /// @@ -602,7 +604,7 @@ public: const SmallVectorImpl<Dependence> &Dependences) { Accesses.append(Instructions.begin(), Instructions.end()); - DEBUG(dbgs() << "Backward dependences:\n"); + LLVM_DEBUG(dbgs() << "Backward dependences:\n"); for (auto &Dep : Dependences) if (Dep.isPossiblyBackward()) { // Note that the designations source and destination follow the program @@ -611,7 +613,7 @@ public: ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; - DEBUG(Dep.print(dbgs(), 2, Instructions)); + LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions)); } } @@ -619,7 +621,7 @@ private: AccessesType Accesses; }; -/// \brief The actual class performing the per-loop work. +/// The actual class performing the per-loop work. class LoopDistributeForLoop { public: LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT, @@ -628,12 +630,13 @@ public: setForced(); } - /// \brief Try to distribute an inner-most loop. + /// Try to distribute an inner-most loop. bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) { assert(L->empty() && "Only process inner loops."); - DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName() - << "\" checking " << *L << "\n"); + LLVM_DEBUG(dbgs() << "\nLDist: In \"" + << L->getHeader()->getParent()->getName() + << "\" checking " << *L << "\n"); if (!L->getExitBlock()) return fail("MultipleExitBlocks", "multiple exit blocks"); @@ -705,7 +708,7 @@ public: for (auto *Inst : DefsUsedOutside) Partitions.addToNewNonCyclicPartition(Inst); - DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); + LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); if (Partitions.getSize() < 2) return fail("CantIsolateUnsafeDeps", "cannot isolate unsafe dependencies"); @@ -713,20 +716,20 @@ public: // Run the merge heuristics: Merge non-cyclic adjacent partitions since we // should be able to vectorize these together. Partitions.mergeBeforePopulating(); - DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); + LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); if (Partitions.getSize() < 2) return fail("CantIsolateUnsafeDeps", "cannot isolate unsafe dependencies"); // Now, populate the partitions with non-memory operations. Partitions.populateUsedSet(); - DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions); + LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions); // In order to preserve original lexical order for loads, keep them in the // partition that we set up in the MemoryInstructionDependences loop. 
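As a reminder of what the partitioning is for, here is a sketch of the end-to-end effect of distribution (illustrative source, not from this change):

    // One loop mixing a recurrence with an independent stream:
    //   for (int I = 1; I < N; ++I) {
    //     A[I] = A[I - 1] + B[I];   // dependence cycle: not vectorizable
    //     C[I] = D[I] * 2;          // independent: vectorizable
    //   }
    // distributes into two loops, isolating the cycle:
    //   for (int I = 1; I < N; ++I) A[I] = A[I - 1] + B[I];
    //   for (int I = 1; I < N; ++I) C[I] = D[I] * 2;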
if (Partitions.mergeToAvoidDuplicatedLoads()) { - DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" - << Partitions); + LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" + << Partitions); if (Partitions.getSize() < 2) return fail("CantIsolateUnsafeDeps", "cannot isolate unsafe dependencies"); @@ -740,7 +743,7 @@ public: return fail("TooManySCEVRuntimeChecks", "too many SCEV run-time checks needed.\n"); - DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); + LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); // We're done forming the partitions set up the reverse mapping from // instructions to partitions. Partitions.setupPartitionIdOnInstructions(); @@ -759,8 +762,8 @@ public: RtPtrChecking); if (!Pred.isAlwaysTrue() || !Checks.empty()) { - DEBUG(dbgs() << "\nPointers:\n"); - DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + LLVM_DEBUG(dbgs() << "\nPointers:\n"); + LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); LoopVersioning LVer(*LAI, L, LI, DT, SE, false); LVer.setAliasChecks(std::move(Checks)); LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate()); @@ -775,12 +778,12 @@ public: // Now, we remove the instruction from each loop that don't belong to that // partition. Partitions.removeUnusedInsts(); - DEBUG(dbgs() << "\nAfter removing unused Instrs:\n"); - DEBUG(Partitions.printBlocks()); + LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n"); + LLVM_DEBUG(Partitions.printBlocks()); if (LDistVerify) { LI->verify(*DT); - DT->verifyDomTree(); + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); } ++NumLoopsDistributed; @@ -793,12 +796,12 @@ public: return true; } - /// \brief Provide diagnostics then \return with false. + /// Provide diagnostics then \return with false. bool fail(StringRef RemarkName, StringRef Message) { LLVMContext &Ctx = F->getContext(); bool Forced = isForced().getValueOr(false); - DEBUG(dbgs() << "Skipping; " << Message << "\n"); + LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n"); // With Rpass-missed report that distribution failed. ORE->emit([&]() { @@ -826,7 +829,7 @@ public: return false; } - /// \brief Return if distribution forced to be enabled/disabled for the loop. + /// Return if distribution forced to be enabled/disabled for the loop. /// /// If the optional has a value, it indicates whether distribution was forced /// to be enabled (true) or disabled (false). If the optional has no value @@ -834,7 +837,7 @@ public: const Optional<bool> &isForced() const { return IsForced; } private: - /// \brief Filter out checks between pointers from the same partition. + /// Filter out checks between pointers from the same partition. /// /// \p PtrToPartition contains the partition number for pointers. Partition /// number -1 means that the pointer is used in multiple partitions. In this @@ -873,7 +876,7 @@ private: return Checks; } - /// \brief Check whether the loop metadata is forcing distribution to be + /// Check whether the loop metadata is forcing distribution to be /// enabled/disabled. void setForced() { Optional<const MDOperand *> Value = @@ -896,7 +899,7 @@ private: ScalarEvolution *SE; OptimizationRemarkEmitter *ORE; - /// \brief Indicates whether distribution is forced to be enabled/disabled for + /// Indicates whether distribution is forced to be enabled/disabled for /// the loop. 
/// /// If the optional has a value, it indicates whether distribution was forced @@ -939,7 +942,7 @@ static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, namespace { -/// \brief The pass class. +/// The pass class. class LoopDistributeLegacy : public FunctionPass { public: static char ID; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 21551f0a0825..d8692198f7a3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -37,7 +37,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -57,6 +56,7 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -87,8 +87,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <algorithm> #include <cassert> @@ -188,8 +188,9 @@ private: PHINode *CntPhi, Value *Var); bool recognizeAndInsertCTLZ(); void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst, - PHINode *CntPhi, Value *Var, const DebugLoc DL, - bool ZeroCheck, bool IsCntPhiUsedOutsideLoop); + PHINode *CntPhi, Value *Var, Instruction *DefX, + const DebugLoc &DL, bool ZeroCheck, + bool IsCntPhiUsedOutsideLoop); /// @} }; @@ -310,9 +311,9 @@ bool LoopIdiomRecognize::runOnCountableLoop() { SmallVector<BasicBlock *, 8> ExitBlocks; CurLoop->getUniqueExitBlocks(ExitBlocks); - DEBUG(dbgs() << "loop-idiom Scanning: F[" - << CurLoop->getHeader()->getParent()->getName() << "] Loop %" - << CurLoop->getHeader()->getName() << "\n"); + LLVM_DEBUG(dbgs() << "loop-idiom Scanning: F[" + << CurLoop->getHeader()->getParent()->getName() + << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); bool MadeChange = false; @@ -756,8 +757,8 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, MSIs.insert(MSI); bool NegStride = SizeInBytes == -Stride; return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getAlignment(), SplatValue, MSI, MSIs, Ev, - BECount, NegStride, /*IsLoopMemset=*/true); + MSI->getDestAlignment(), SplatValue, MSI, MSIs, + Ev, BECount, NegStride, /*IsLoopMemset=*/true); } /// mayLoopAccessLocation - Return true if the specified loop might access the @@ -936,8 +937,9 @@ bool LoopIdiomRecognize::processLoopStridedStore( NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); } - DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n" - << " from store to: " << *Ev << " at: " << *TheStore << "\n"); + LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n" + << " from store to: " << *Ev << " at: " << *TheStore + << "\n"); NewCall->setDebugLoc(TheStore->getDebugLoc()); // Okay, the memset has been formed. 
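For context, the idiom that reaches this point is a strided store of a loop-invariant splat value; a scalar analogue of the rewrite, assuming unit stride and a zero pattern:

    //   for (int I = 0; I < N; ++I) P[I] = 0;    // recognized store idiom
    // becomes, conceptually,
    //   memset(P, 0, N * sizeof(*P));            // single call in preheader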
Zap the original store and anything that // feeds into it. @@ -1037,16 +1039,17 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); - unsigned Align = std::min(SI->getAlignment(), LI->getAlignment()); CallInst *NewCall = nullptr; // Check whether to generate an unordered atomic memcpy: - // If the load or store are atomic, then they must neccessarily be unordered + // If the load or store are atomic, then they must necessarily be unordered // by previous checks. if (!SI->isAtomic() && !LI->isAtomic()) - NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align); + NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(), + LoadBasePtr, LI->getAlignment(), NumBytes); else { // We cannot allow unaligned ops for unordered load/store, so reject // anything where the alignment isn't at least the element size. + unsigned Align = std::min(SI->getAlignment(), LI->getAlignment()); if (Align < StoreSize) return false; @@ -1066,9 +1069,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, } NewCall->setDebugLoc(SI->getDebugLoc()); - DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" - << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" - << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); + LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" + << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" + << " from store ptr=" << *StoreEv << " at: " << *SI + << "\n"); // Okay, the memcpy has been formed. Zap the original store and anything that // feeds into it. @@ -1084,9 +1088,9 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, bool IsLoopMemset) { if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) { if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) { - DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName() - << " : LIR " << (IsMemset ? "Memset" : "Memcpy") - << " avoided: multi-block top-level loop\n"); + LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName() + << " : LIR " << (IsMemset ? "Memset" : "Memcpy") + << " avoided: multi-block top-level loop\n"); return true; } } @@ -1195,14 +1199,13 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, VarX1 = DefX2->getOperand(0); SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); } - if (!SubOneOp) + if (!SubOneOp || SubOneOp->getOperand(0) != VarX1) return false; - Instruction *SubInst = cast<Instruction>(SubOneOp); - ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1)); + ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1)); if (!Dec || - !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || - (SubInst->getOpcode() == Instruction::Add && + !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) || + (SubOneOp->getOpcode() == Instruction::Add && Dec->isMinusOne()))) { return false; } @@ -1314,7 +1317,8 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, return false; // step 2: detect instructions corresponding to "x.next = x >> 1" - if (!DefX || DefX->getOpcode() != Instruction::AShr) + if (!DefX || (DefX->getOpcode() != Instruction::AShr && + DefX->getOpcode() != Instruction::LShr)) return false; ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)); if (!Shft || !Shft->isOne()) @@ -1372,13 +1376,13 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() { bool IsCntPhiUsedOutsideLoop = false; for (User *U : CntPhi->users()) - if (!CurLoop->contains(dyn_cast<Instruction>(U))) { + if (!CurLoop->contains(cast<Instruction>(U))) { IsCntPhiUsedOutsideLoop = true; break; } bool IsCntInstUsedOutsideLoop = false; for (User *U : CntInst->users()) - if (!CurLoop->contains(dyn_cast<Instruction>(U))) { + if (!CurLoop->contains(cast<Instruction>(U))) { IsCntInstUsedOutsideLoop = true; break; } @@ -1395,16 +1399,27 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() { // parent function RunOnLoop. BasicBlock *PH = CurLoop->getLoopPreheader(); Value *InitX = PhiX->getIncomingValueForBlock(PH); - // If we check X != 0 before entering the loop we don't need a zero - // check in CTLZ intrinsic, but only if Cnt Phi is not used outside of the - // loop (if it is used we count CTLZ(X >> 1)). - if (!IsCntPhiUsedOutsideLoop) - if (BasicBlock *PreCondBB = PH->getSinglePredecessor()) - if (BranchInst *PreCondBr = - dyn_cast<BranchInst>(PreCondBB->getTerminator())) { - if (matchCondition(PreCondBr, PH) == InitX) - ZeroCheck = true; - } + + // Make sure the initial value can't be negative; otherwise the ashr in the + // loop might never reach zero, which would make the loop infinite. + if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, *DL)) + return false; + + // If we are using the count instruction outside the loop, make sure we + // have a zero check as a precondition. Without the check the loop would run + // one iteration before any check of the input value. This means 0 and 1 + // would have identical behavior in the original loop, and thus the rewrite + // would be wrong for a zero input without the check. + if (!IsCntPhiUsedOutsideLoop) { + auto *PreCondBB = PH->getSinglePredecessor(); + if (!PreCondBB) + return false; + auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator()); + if (!PreCondBI) + return false; + if (matchCondition(PreCondBI, PH) != InitX) + return false; + ZeroCheck = true; + } // Check if CTLZ intrinsic is profitable.
Assume it is always profitable // if we delete the loop (the loop has only 6 instructions): @@ -1415,17 +1430,16 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() { // %inc = add nsw %i.0, 1 // br i1 %tobool - IRBuilder<> Builder(PH->getTerminator()); - SmallVector<const Value *, 2> Ops = - {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()}; - ArrayRef<const Value *> Args(Ops); + const Value *Args[] = + {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext()) + : ConstantInt::getFalse(InitX->getContext())}; if (CurLoop->getHeader()->size() != 6 && TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) > TargetTransformInfo::TCC_Basic) return false; - const DebugLoc DL = DefX->getDebugLoc(); - transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck, + transformLoopToCountable(PH, CntInst, CntPhi, InitX, DefX, + DefX->getDebugLoc(), ZeroCheck, IsCntPhiUsedOutsideLoop); return true; } @@ -1461,7 +1475,7 @@ bool LoopIdiomRecognize::recognizePopcount() { if (!EntryBI || EntryBI->isConditional()) return false; - // It should have a precondition block where the generated popcount instrinsic + // It should have a precondition block where the generated popcount intrinsic // function can be inserted. auto *PreCondBB = PH->getSinglePredecessor(); if (!PreCondBB) @@ -1539,8 +1553,9 @@ static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val, /// If CntInst and DefX are not used in LOOP_BODY they will be removed. void LoopIdiomRecognize::transformLoopToCountable( BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, - const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { - BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator()); + Instruction *DefX, const DebugLoc &DL, bool ZeroCheck, + bool IsCntPhiUsedOutsideLoop) { + BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator()); // Step 1: Insert the CTLZ instruction at the end of the preheader block // Count = BitWidth - CTLZ(InitX); @@ -1550,10 +1565,16 @@ void LoopIdiomRecognize::transformLoopToCountable( Builder.SetCurrentDebugLocation(DL); Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext; - if (IsCntPhiUsedOutsideLoop) - InitXNext = Builder.CreateAShr(InitX, - ConstantInt::get(InitX->getType(), 1)); - else + if (IsCntPhiUsedOutsideLoop) { + if (DefX->getOpcode() == Instruction::AShr) + InitXNext = + Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1)); + else if (DefX->getOpcode() == Instruction::LShr) + InitXNext = + Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1)); + else + llvm_unreachable("Unexpected opcode!"); + } else InitXNext = InitX; CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck); Count = Builder.CreateSub( @@ -1588,7 +1609,7 @@ void LoopIdiomRecognize::transformLoopToCountable( // ... 
// Br: loop if (Dec != 0) BasicBlock *Body = *(CurLoop->block_begin()); - auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator()); + auto *LbBr = cast<BranchInst>(Body->getTerminator()); ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); Type *Ty = Count->getType();
@@ -1625,8 +1646,8 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var) { BasicBlock *PreHead = CurLoop->getLoopPreheader(); - auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator()); - const DebugLoc DL = CntInst->getDebugLoc(); + auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator()); + const DebugLoc &DL = CntInst->getDebugLoc(); // Assuming before transformation, the loop is following: // if (x) // the precondition
@@ -1675,7 +1696,7 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, } // Step 3: Note that the population count is exactly the trip count of the - // loop in question, which enable us to to convert the loop from noncountable + // loop in question, which enables us to convert the loop from a noncountable // loop into a countable one. The benefit is twofold: // // - If the loop only counts population, the entire loop becomes dead after
@@ -1696,7 +1717,7 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, // do { cnt++; x &= x-1; t--; } while (t > 0); BasicBlock *Body = *(CurLoop->block_begin()); { - auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator()); + auto *LbBr = cast<BranchInst>(Body->getTerminator()); ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); Type *Ty = TripCnt->getType();
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index 40d468a084d4..71859efbf4bd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -20,8 +20,10 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DataLayout.h"
@@ -34,7 +36,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <algorithm> #include <utility>
@@ -45,118 +46,116 @@ using namespace llvm; STATISTIC(NumSimplified, "Number of redundant instructions simplified"); -static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI, - AssumptionCache *AC, - const TargetLibraryInfo *TLI) { - SmallVector<BasicBlock *, 8> ExitBlocks; - L->getUniqueExitBlocks(ExitBlocks); - array_pod_sort(ExitBlocks.begin(), ExitBlocks.end()); - +static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI, + AssumptionCache &AC, + const TargetLibraryInfo &TLI) { + const DataLayout &DL = L.getHeader()->getModule()->getDataLayout(); + SimplifyQuery SQ(DL, &TLI, &DT, &AC); + + // On the first pass over the loop body we try to simplify every instruction. + // On subsequent passes, we can restrict this to only simplifying instructions + // where the inputs have been updated.
We end up needing two sets: one + // containing the instructions we are simplifying in *this* pass, and one for + // the instructions we will want to simplify in the *next* pass. We use + // pointers so we can swap between two stably allocated sets. SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; - // The bit we are stealing from the pointer represents whether this basic - // block is the header of a subloop, in which case we only process its phis. - using WorklistItem = PointerIntPair<BasicBlock *, 1>; - SmallVector<WorklistItem, 16> VisitStack; - SmallPtrSet<BasicBlock *, 32> Visited; - - bool Changed = false; - bool LocalChanged; - do { - LocalChanged = false; - - VisitStack.clear(); - Visited.clear(); + // Track the PHI nodes that have already been visited during each iteration so + // that we can identify when it is necessary to iterate. SmallPtrSet<PHINode *, 4> VisitedPHIs; - VisitStack.push_back(WorklistItem(L->getHeader(), false)); + // While simplifying we may discover dead code or cause code to become dead. + // Keep track of all such instructions and we will delete them at the end. SmallVector<Instruction *, 8> DeadInsts; - while (!VisitStack.empty()) { - WorklistItem Item = VisitStack.pop_back_val(); - BasicBlock *BB = Item.getPointer(); - bool IsSubloopHeader = Item.getInt(); - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + // First we want to create an RPO traversal of the loop body. By processing in + // RPO we can ensure that definitions are processed prior to uses (for non-PHI + // uses) in all cases. This ensures we maximize the simplifications in each + // iteration over the loop and minimize the possible causes for continuing to + // iterate. + LoopBlocksRPO RPOT(&L); + RPOT.perform(&LI); - // Simplify instructions in the current basic block. - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = &*BI++; - - // The first time through the loop ToSimplify is empty and we try to - // simplify all instructions. On later iterations ToSimplify is not - // empty and we only bother simplifying instructions that are in it. - if (!ToSimplify->empty() && !ToSimplify->count(I)) + bool Changed = false; + for (;;) { + for (BasicBlock *BB : RPOT) { + for (Instruction &I : *BB) { + if (auto *PI = dyn_cast<PHINode>(&I)) + VisitedPHIs.insert(PI); + + if (I.use_empty()) { + if (isInstructionTriviallyDead(&I, &TLI)) + DeadInsts.push_back(&I); continue; - - // Don't bother simplifying unused instructions. - if (!I->use_empty()) { - Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC}); - if (V && LI->replacementPreservesLCSSAForm(I, V)) { - // Mark all uses for resimplification next time round the loop. - for (User *U : I->users()) - Next->insert(cast<Instruction>(U)); - - I->replaceAllUsesWith(V); - LocalChanged = true; - ++NumSimplified; - } - } - if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) { - // RecursivelyDeleteTriviallyDeadInstruction can remove more than one - // instruction, so simply incrementing the iterator does not work. - // When instructions get deleted re-iterate instead. - BI = BB->begin(); - BE = BB->end(); - LocalChanged = true; } - if (IsSubloopHeader && !isa<PHINode>(I)) - break; - } + // We special case the first iteration, which we can detect due to the + // empty `ToSimplify` set. bool IsFirstIteration = ToSimplify->empty(); - // Add all successors to the worklist, except for loop exit blocks and the - // bodies of subloops.
We visit the headers of loops so that we can - // process - // their phis, but we contract the rest of the subloop body and only - // follow - // edges leading back to the original loop. - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; - ++SI) { - BasicBlock *SuccBB = *SI; - if (!Visited.insert(SuccBB).second) + if (!IsFirstIteration && !ToSimplify->count(&I)) continue; - const Loop *SuccLoop = LI->getLoopFor(SuccBB); - if (SuccLoop && SuccLoop->getHeader() == SuccBB && - L->contains(SuccLoop)) { - VisitStack.push_back(WorklistItem(SuccBB, true)); - - SmallVector<BasicBlock *, 8> SubLoopExitBlocks; - SuccLoop->getExitBlocks(SubLoopExitBlocks); - - for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) { - BasicBlock *ExitBB = SubLoopExitBlocks[i]; - if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second) - VisitStack.push_back(WorklistItem(ExitBB, false)); - } - + Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I)); + if (!V || !LI.replacementPreservesLCSSAForm(&I, V)) continue; - } - bool IsExitBlock = - std::binary_search(ExitBlocks.begin(), ExitBlocks.end(), SuccBB); - if (IsExitBlock) - continue; + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); + UI != UE;) { + Use &U = *UI++; + auto *UserI = cast<Instruction>(U.getUser()); + U.set(V); + + // If the instruction is used by a PHI node we have already processed + // we'll need to iterate on the loop body to converge, so add it to + // the next set. + if (auto *UserPI = dyn_cast<PHINode>(UserI)) + if (VisitedPHIs.count(UserPI)) { + Next->insert(UserPI); + continue; + } + + // If we are only simplifying targeted instructions and the user is an + // instruction in the loop body, add it to our set of targeted + // instructions. Because we process defs before uses (outside of PHIs) + // we won't have visited it yet. + // + // We also skip any uses outside of the loop being simplified. Those + // should always be PHI nodes due to LCSSA form, and we don't want to + // try to simplify those away. + assert((L.contains(UserI) || isa<PHINode>(UserI)) && + "Uses outside the loop should be PHI nodes due to LCSSA!"); + if (!IsFirstIteration && L.contains(UserI)) + ToSimplify->insert(UserI); + } - VisitStack.push_back(WorklistItem(SuccBB, false)); + assert(I.use_empty() && "Should always have replaced all uses!"); + if (isInstructionTriviallyDead(&I, &TLI)) + DeadInsts.push_back(&I); + ++NumSimplified; + Changed = true; } } - // Place the list of instructions to simplify on the next loop iteration - // into ToSimplify. - std::swap(ToSimplify, Next); - Next->clear(); + // Delete any dead instructions found thus far now that we've finished an + // iteration over all instructions in all the loop blocks. + if (!DeadInsts.empty()) { + Changed = true; + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI); + } + + // If we never found a PHI that needs to be simplified in the next + // iteration, we're done. + if (Next->empty()) + break; - Changed |= LocalChanged; - } while (LocalChanged); + // Otherwise, put the next set in place for the next iteration and reset it + // and the visited PHIs for that iteration. + std::swap(Next, ToSimplify); + Next->clear(); + VisitedPHIs.clear(); + DeadInsts.clear(); + } return Changed; } @@ -174,21 +173,20 @@ public: bool runOnLoop(Loop *L, LPPassManager &LPM) override { if (skipLoop(L)) return false; - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DominatorTree *DT = DTWP ? 
&DTWP->getDomTree() : nullptr; - LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - AssumptionCache *AC = - &getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + AssumptionCache &AC = + getAnalysis<AssumptionCacheTracker>().getAssumptionCache( *L->getHeader()->getParent()); - const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - return SimplifyLoopInst(L, DT, LI, AC, TLI); + return simplifyLoopInst(*L, DT, LI, AC, TLI); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.setPreservesCFG(); getLoopAnalysisUsage(AU); @@ -200,7 +198,7 @@ public: PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { - if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI)) + if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI)) return PreservedAnalyses::all(); auto PA = getLoopPassPreservedAnalyses(); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 4f8dafef230a..2978165ed8a9 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/DependenceAnalysis.h" @@ -40,6 +41,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <cassert> @@ -50,6 +52,8 @@ using namespace llvm; #define DEBUG_TYPE "loop-interchange" +STATISTIC(LoopsInterchanged, "Number of loops interchanged"); + static cl::opt<int> LoopInterchangeCostThreshold( "loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number")); @@ -73,8 +77,8 @@ static const unsigned MaxLoopNestDepth = 10; static void printDepMatrix(CharMatrix &DepMatrix) { for (auto &Row : DepMatrix) { for (auto D : Row) - DEBUG(dbgs() << D << " "); - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << D << " "); + LLVM_DEBUG(dbgs() << "\n"); } } #endif @@ -103,8 +107,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, } } - DEBUG(dbgs() << "Found " << MemInstr.size() - << " Loads and Stores to analyze\n"); + LLVM_DEBUG(dbgs() << "Found " << MemInstr.size() + << " Loads and Stores to analyze\n"); ValueVector::iterator I, IE, J, JE; @@ -121,11 +125,11 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, // Track Output, Flow, and Anti dependencies. if (auto D = DI->depends(Src, Dst, true)) { assert(D->isOrdered() && "Expected an output, flow or anti dep."); - DEBUG(StringRef DepType = - D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output"; - dbgs() << "Found " << DepType - << " dependency between Src and Dst\n" - << " Src:" << *Src << "\n Dst:" << *Dst << '\n'); + LLVM_DEBUG(StringRef DepType = + D->isFlow() ? 
"flow" : D->isAnti() ? "anti" : "output"; + dbgs() << "Found " << DepType + << " dependency between Src and Dst\n" + << " Src:" << *Src << "\n Dst:" << *Dst << '\n'); unsigned Levels = D->getLevels(); char Direction; for (unsigned II = 1; II <= Levels; ++II) { @@ -165,17 +169,14 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, DepMatrix.push_back(Dep); if (DepMatrix.size() > MaxMemInstrCount) { - DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount - << " dependencies inside loop\n"); + LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount + << " dependencies inside loop\n"); return false; } } } } - // We don't have a DepMatrix to check legality return false. - if (DepMatrix.empty()) - return false; return true; } @@ -271,9 +272,9 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, } static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) { - DEBUG(dbgs() << "Calling populateWorklist on Func: " - << L.getHeader()->getParent()->getName() << " Loop: %" - << L.getHeader()->getName() << '\n'); + LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: " + << L.getHeader()->getParent()->getName() << " Loop: %" + << L.getHeader()->getName() << '\n'); LoopVector LoopList; Loop *CurrentLoop = &L; const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); @@ -404,7 +405,9 @@ public: /// Interchange OuterLoop and InnerLoop. bool transform(); - void restructureLoops(Loop *InnerLoop, Loop *OuterLoop); + void restructureLoops(Loop *NewInner, Loop *NewOuter, + BasicBlock *OrigInnerPreHeader, + BasicBlock *OrigOuterPreHeader); void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); private: @@ -453,6 +456,9 @@ struct LoopInterchange : public FunctionPass { AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); } bool runOnFunction(Function &F) override { @@ -462,8 +468,7 @@ struct LoopInterchange : public FunctionPass { SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI(); - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? 
&DTWP->getDomTree() : nullptr; + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); @@ -473,7 +478,7 @@ struct LoopInterchange : public FunctionPass { for (Loop *L : *LI) populateWorklist(*L, Worklist); - DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n"); + LLVM_DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n"); bool Changed = true; while (!Worklist.empty()) { LoopVector LoopList = Worklist.pop_back_val(); @@ -486,15 +491,15 @@ struct LoopInterchange : public FunctionPass { for (Loop *L : LoopList) { const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); if (ExitCountOuter == SE->getCouldNotCompute()) { - DEBUG(dbgs() << "Couldn't compute backedge count\n"); + LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n"); return false; } if (L->getNumBackEdges() != 1) { - DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); + LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); return false; } if (!L->getExitingBlock()) { - DEBUG(dbgs() << "Loop doesn't have unique exit block\n"); + LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n"); return false; } } @@ -511,53 +516,38 @@ struct LoopInterchange : public FunctionPass { bool Changed = false; unsigned LoopNestDepth = LoopList.size(); if (LoopNestDepth < 2) { - DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n"); + LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n"); return false; } if (LoopNestDepth > MaxLoopNestDepth) { - DEBUG(dbgs() << "Cannot handle loops of depth greater than " - << MaxLoopNestDepth << "\n"); + LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than " + << MaxLoopNestDepth << "\n"); return false; } if (!isComputableLoopNest(LoopList)) { - DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); + LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); return false; } - DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n"); + LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth + << "\n"); CharMatrix DependencyMatrix; Loop *OuterMostLoop = *(LoopList.begin()); if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, OuterMostLoop, DI)) { - DEBUG(dbgs() << "Populating dependency matrix failed\n"); + LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n"); return false; } #ifdef DUMP_DEP_MATRICIES - DEBUG(dbgs() << "Dependence before interchange\n"); + LLVM_DEBUG(dbgs() << "Dependence before interchange\n"); printDepMatrix(DependencyMatrix); #endif - BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch(); - BranchInst *OuterMostLoopLatchBI = - dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator()); - if (!OuterMostLoopLatchBI) - return false; - - // Since we currently do not handle LCSSA PHI's any failure in loop - // condition will now branch to LoopNestExit. - // TODO: This should be removed once we handle LCSSA PHI nodes. - // Get the Outermost loop exit. 
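As background for the legality and profitability machinery in these hunks, the payoff of interchange is easiest to see at the source level; a minimal sketch (array, bounds, and function names are illustrative, not from the pass):

```cpp
// Illustrative only. Interchange swaps the loop order so the innermost
// loop walks memory contiguously; the profitability analysis looks for
// exactly this kind of stride improvement.
enum { N = 1024, M = 1024 };
static int A[N][M];

long sumBefore() {            // inner loop jumps M ints per iteration
  long Sum = 0;
  for (int j = 0; j < M; ++j)
    for (int i = 0; i < N; ++i)
      Sum += A[i][j];
  return Sum;
}

long sumAfter() {             // after interchange: stride-1 accesses
  long Sum = 0;
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < M; ++j)
      Sum += A[i][j];
  return Sum;
}
```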
- BasicBlock *LoopNestExit; - if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader()) - LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1); - else - LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0); - - if (isa<PHINode>(LoopNestExit->begin())) { - DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now " - "since on failure all loops branch to loop nest exit.\n"); + BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock(); + if (!LoopNestExit) { + LLVM_DEBUG(dbgs() << "OuterMostLoop needs a unique exit block"); return false; }
@@ -573,9 +563,8 @@ struct LoopInterchange : public FunctionPass { // Update the DependencyMatrix interChangeDependencies(DependencyMatrix, i, i - 1); - DT->recalculate(F); #ifdef DUMP_DEP_MATRICIES - DEBUG(dbgs() << "Dependence after interchange\n"); + LLVM_DEBUG(dbgs() << "Dependence after interchange\n"); printDepMatrix(DependencyMatrix); #endif Changed |= Interchanged;
@@ -586,21 +575,21 @@ struct LoopInterchange : public FunctionPass { bool processLoop(LoopVector LoopList, unsigned InnerLoopId, unsigned OuterLoopId, BasicBlock *LoopNestExit, std::vector<std::vector<char>> &DependencyMatrix) { - DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId - << " and OuterLoopId = " << OuterLoopId << "\n"); + LLVM_DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId + << " and OuterLoopId = " << OuterLoopId << "\n"); Loop *InnerLoop = LoopList[InnerLoopId]; Loop *OuterLoop = LoopList[OuterLoopId]; LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, PreserveLCSSA, ORE); if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { - DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n"); + LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n"); return false; } - DEBUG(dbgs() << "Loops are legal to interchange\n"); + LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { - DEBUG(dbgs() << "Interchanging loops not profitable\n"); + LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); return false; }
@@ -614,7 +603,8 @@ struct LoopInterchange : public FunctionPass { LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, LIL.hasInnerLoopReduction()); LIT.transform(); - DEBUG(dbgs() << "Loops interchanged\n"); + LLVM_DEBUG(dbgs() << "Loops interchanged.\n"); + LoopsInterchanged++; return true; } };
@@ -631,13 +621,13 @@ bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) { bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader( BasicBlock *BB) { - for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { + for (Instruction &I : *BB) { // Loads corresponding to reduction PHI's are safe while concluding if // tightly nested. - if (LoadInst *L = dyn_cast<LoadInst>(I)) { + if (LoadInst *L = dyn_cast<LoadInst>(&I)) { if (!areAllUsesReductions(L, InnerLoop)) return true; - } else if (I->mayHaveSideEffects() || I->mayReadFromMemory()) + } else if (I.mayHaveSideEffects() || I.mayReadFromMemory()) return true; } return false;
@@ -645,13 +635,13 @@ bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch( BasicBlock *BB) { - for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { + for (Instruction &I : *BB) { // Stores corresponding to reductions are safe while concluding if tightly // nested.
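The two helpers above enforce the tight-nesting requirement; a hedged counterexample of the kind of nest they reject (names illustrative):

```cpp
// Illustrative only. This nest is not tightly nested: the outer loop
// does work besides running the inner loop, so swapping the loops would
// change how often that store executes. The unsafe-instruction checks
// above are what would reject it.
void rowSums(int N, int M, int **A, int *Rows) {
  for (int i = 0; i < N; ++i) {
    Rows[i] = 0;              // extra work between the two loop levels
    for (int j = 0; j < M; ++j)
      Rows[i] += A[i][j];
  }
}
```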
- if (StoreInst *L = dyn_cast<StoreInst>(I)) { + if (StoreInst *L = dyn_cast<StoreInst>(&I)) { if (!isa<PHINode>(L->getOperand(0))) return true; - } else if (I->mayHaveSideEffects() || I->mayReadFromMemory()) + } else if (I.mayHaveSideEffects() || I.mayReadFromMemory()) return true; } return false; }
@@ -662,7 +652,7 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); - DEBUG(dbgs() << "Checking if loops are tightly nested\n"); + LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n"); // A perfectly nested loop will not have any branch in between the outer and // inner block i.e. outer header will branch to either inner preheader and
@@ -676,14 +666,14 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { if (Succ != InnerLoopPreHeader && Succ != OuterLoopLatch) return false; - DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n"); + LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n"); // We do not have any basic block in between; now make sure the outer header // and outer loop latch don't contain any unsafe instructions. if (containsUnsafeInstructionsInHeader(OuterLoopHeader) || containsUnsafeInstructionsInLatch(OuterLoopLatch)) return false; - DEBUG(dbgs() << "Loops are perfectly nested\n"); + LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n"); // We have a perfect loop nest. return true; }
@@ -717,16 +707,15 @@ bool LoopInterchangeLegality::findInductionAndReductions( SmallVector<PHINode *, 8> &Reductions) { if (!L->getLoopLatch() || !L->getLoopPredecessor()) return false; - for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { + for (PHINode &PHI : L->getHeader()->phis()) { RecurrenceDescriptor RD; InductionDescriptor ID; - PHINode *PHI = cast<PHINode>(I); - if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID)) - Inductions.push_back(PHI); - else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) - Reductions.push_back(PHI); + if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) + Inductions.push_back(&PHI); + else if (RecurrenceDescriptor::isReductionPHI(&PHI, L, RD)) + Reductions.push_back(&PHI); else { - DEBUG( + LLVM_DEBUG( dbgs() << "Failed to recognize PHI as an induction or reduction.\n"); return false; }
@@ -735,12 +724,11 @@ bool LoopInterchangeLegality::findInductionAndReductions( } static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { - for (auto I = Block->begin(); isa<PHINode>(I); ++I) { - PHINode *PHI = cast<PHINode>(I); + for (PHINode &PHI : Block->phis()) { // Reduction lcssa phi will have only 1 incoming block, which is from the loop latch.
- if (PHI->getNumIncomingValues() > 1) + if (PHI.getNumIncomingValues() > 1) return false; - Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0)); + Instruction *Ins = dyn_cast<Instruction>(PHI.getIncomingValue(0)); if (!Ins) return false; // Incoming value for lcssa phi's in outer loop exit can only be inner loop
@@ -751,35 +739,38 @@ static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { return true; } -static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock, - BasicBlock *LoopHeader) { - if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) { - assert(BI->getNumSuccessors() == 2 && - "Branch leaving loop latch must have 2 successors"); - for (BasicBlock *Succ : BI->successors()) { - if (Succ == LoopHeader) - continue; - return Succ; - } - } - return nullptr; -} - // This function indicates the current limitations in the transform as a result // of which we do not proceed. bool LoopInterchangeLegality::currentLimitations() { BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); - BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); - BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); - BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); + + // The transform currently expects the loop latches to also be the exiting + // blocks. if (InnerLoop->getExitingBlock() != InnerLoopLatch || OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() || !isa<BranchInst>(InnerLoopLatch->getTerminator()) || !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) { + LLVM_DEBUG( dbgs() << "Loops where the latch is not the exiting block are not" << " supported currently.\n"); + ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch", OuterLoop->getStartLoc(), OuterLoop->getHeader()) << "Loops where the latch is not the exiting block cannot be" " interchanged currently."; }); + return true; + } PHINode *InnerInductionVar; SmallVector<PHINode *, 8> Inductions; SmallVector<PHINode *, 8> Reductions; if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) { - DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes " - << "are supported currently.\n"); + LLVM_DEBUG( dbgs() << "Only inner loops with induction or reduction PHI nodes " << "are supported currently.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", InnerLoop->getStartLoc(),
@@ -792,8 +783,9 @@ bool LoopInterchangeLegality::currentLimitations() { // TODO: Currently we handle only loops with 1 induction variable. if (Inductions.size() != 1) { - DEBUG(dbgs() << "We currently only support loops with 1 induction variable." - << "Failed to interchange due to current limitation\n"); + LLVM_DEBUG( dbgs() << "We currently only support loops with 1 induction variable. "
+ << "Failed to interchange due to current limitation\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner", InnerLoop->getStartLoc(), @@ -809,8 +801,9 @@ bool LoopInterchangeLegality::currentLimitations() { InnerInductionVar = Inductions.pop_back_val(); Reductions.clear(); if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) { - DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes " - << "are supported currently.\n"); + LLVM_DEBUG( + dbgs() << "Only outer loops with induction or reduction PHI nodes " + << "are supported currently.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter", OuterLoop->getStartLoc(), @@ -824,8 +817,8 @@ bool LoopInterchangeLegality::currentLimitations() { // Outer loop cannot have reduction because then loops will not be tightly // nested. if (!Reductions.empty()) { - DEBUG(dbgs() << "Outer loops with reductions are not supported " - << "currently.\n"); + LLVM_DEBUG(dbgs() << "Outer loops with reductions are not supported " + << "currently.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter", OuterLoop->getStartLoc(), @@ -837,8 +830,8 @@ bool LoopInterchangeLegality::currentLimitations() { } // TODO: Currently we handle only loops with 1 induction variable. if (Inductions.size() != 1) { - DEBUG(dbgs() << "Loops with more than 1 induction variables are not " - << "supported currently.\n"); + LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not " + << "supported currently.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter", OuterLoop->getStartLoc(), @@ -851,7 +844,7 @@ bool LoopInterchangeLegality::currentLimitations() { // TODO: Triangular loops are not handled for now. if (!isLoopStructureUnderstood(InnerInductionVar)) { - DEBUG(dbgs() << "Loop structure not understood by pass\n"); + LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner", InnerLoop->getStartLoc(), @@ -862,23 +855,10 @@ bool LoopInterchangeLegality::currentLimitations() { } // TODO: We only handle LCSSA PHI's corresponding to reduction for now. 
- BasicBlock *LoopExitBlock = - getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader); - if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) { - DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Only outer loops with LCSSA PHIs can be interchange " - "currently."; - }); - return true; - } - - LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader); - if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) { - DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); + BasicBlock *InnerExit = InnerLoop->getExitBlock(); + if (!containsSafePHI(InnerExit, false)) { + LLVM_DEBUG( dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner", InnerLoop->getStartLoc(),
@@ -908,8 +888,9 @@ bool LoopInterchangeLegality::currentLimitations() { dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0)); if (!InnerIndexVarInc) { - DEBUG(dbgs() << "Did not find an instruction to increment the induction " - << "variable.\n"); + LLVM_DEBUG( dbgs() << "Did not find an instruction to increment the induction " << "variable.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner", InnerLoop->getStartLoc(),
@@ -924,7 +905,8 @@ // instruction. bool FoundInduction = false; - for (const Instruction &I : llvm::reverse(*InnerLoopLatch)) { + for (const Instruction &I : + llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) { if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) || isa<ZExtInst>(I)) continue; // We found an instruction. If this is not the induction variable then it is not // safe to split this loop latch. if (!I.isIdenticalTo(InnerIndexVarInc)) { - DEBUG(dbgs() << "Found unsupported instructions between induction " - << "variable increment and branch.\n"); + LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction " + << "variable increment and branch.\n"); ORE->emit([&]() { return OptimizationRemarkMissed( DEBUG_TYPE, "UnsupportedInsBetweenInduction",
@@ -950,7 +932,7 @@ // The loop latch ended and we didn't find the induction variable; return as a // current limitation. if (!FoundInduction) { - DEBUG(dbgs() << "Did not find the induction variable.\n"); + LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable", InnerLoop->getStartLoc(),
@@ -962,13 +944,50 @@ return false; } +// We currently support LCSSA PHI nodes in the outer loop exit, if their +// incoming values do not come from the outer loop latch or if the +// outer loop latch has a single predecessor. In that case, the value will +// be available if both the inner and outer loop conditions are true, which +// will still be true after interchanging. If we have multiple predecessors, +// that may not be the case, e.g. because the outer loop latch may be executed +// if the inner loop is not executed.
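At the source level, the situation this predicate (defined below) guards against looks roughly like the following hedged sketch (names illustrative):

```cpp
// Illustrative only. 'Last' is written in the outer loop latch and used
// after the nest, so in LCSSA form it reaches the exit block through a
// PHI node. If the outer latch could execute on an iteration where the
// inner loop did not run, interchanging could change the value that PHI
// carries out; hence the single-predecessor restriction described above.
int lastAndSum(int N, int M, int **A) {
  int Sum = 0, Last = 0;
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < M; ++j)
      Sum += A[j][i];
    Last = 2 * i;             // defined in the outer loop latch
  }
  return Last + Sum;          // 'Last' flows through an LCSSA exit PHI
}
```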
+static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { + BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); + for (PHINode &PHI : LoopNestExit->phis()) { + // FIXME: We currently are not able to detect floating point reductions + // and have to use floating point PHIs as a proxy to prevent + // interchanging in the presence of floating point reductions. + if (PHI.getType()->isFloatingPointTy()) + return false; + for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { + Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i)); + if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) + continue; + + // The incoming value is defined in the outer loop latch. Currently we + // only support that in case the outer loop latch has a single predecessor. + // This guarantees that the outer loop latch is executed if and only if + // the inner loop is executed (because tightlyNested() guarantees that the + // outer loop header only branches to the inner loop or the outer loop + // latch). + // FIXME: We could weaken this logic and allow multiple predecessors, + // if the values are produced outside the loop latch. We would need + // additional logic to update the PHI nodes in the exit block as + // well. + if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) + return false; + } + } + return true; +} + bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { - DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId - << " and OuterLoopId = " << OuterLoopId - << " due to dependence\n"); + LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId + << " and OuterLoopId = " << OuterLoopId + << " due to dependence\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence", InnerLoop->getStartLoc(), @@ -977,16 +996,23 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, }); return false; } - // Check if outer and inner loop contain legal instructions only. for (auto *BB : OuterLoop->blocks()) - for (Instruction &I : *BB) + for (Instruction &I : BB->instructionsWithoutDebug()) if (CallInst *CI = dyn_cast<CallInst>(&I)) { // readnone functions do not prevent interchanging. if (CI->doesNotReadMemory()) continue; - DEBUG(dbgs() << "Loops with call instructions cannot be interchanged " - << "safely."); + LLVM_DEBUG( + dbgs() << "Loops with call instructions cannot be interchanged " + << "safely."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst", + CI->getDebugLoc(), + CI->getParent()) + << "Cannot interchange loops due to call instruction."; + }); + return false; } @@ -1015,13 +1041,13 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, // TODO: The loops could not be interchanged due to current limitations in the // transform module. if (currentLimitations()) { - DEBUG(dbgs() << "Not legal because of current transform limitation\n"); + LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n"); return false; } // Check if the loops are tightly nested. 
if (!tightlyNested(OuterLoop, InnerLoop)) { - DEBUG(dbgs() << "Loops not tightly nested\n"); + LLVM_DEBUG(dbgs() << "Loops not tightly nested\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested", InnerLoop->getStartLoc(), @@ -1032,6 +1058,17 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, return false; } + if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) { + LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Found unsupported PHI node in loop exit."; + }); + return false; + } + return true; } @@ -1100,7 +1137,8 @@ static bool isProfitableForVectorization(unsigned InnerLoopId, } // If outer loop has dependence and inner loop is loop independent then it is // profitable to interchange to enable parallelism. - return true; + // If there are no dependences, interchanging will not improve anything. + return !DepMatrix.empty(); } bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, @@ -1115,7 +1153,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, // of induction variables in the instruction and allows reordering if number // of bad orders is more than good. int Cost = getInstrOrderCost(); - DEBUG(dbgs() << "Cost = " << Cost << "\n"); + LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); if (Cost < -LoopInterchangeCostThreshold) return true; @@ -1138,33 +1176,88 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, Loop *InnerLoop) { - for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E; - ++I) { - if (*I == InnerLoop) { - OuterLoop->removeChildLoop(I); + for (Loop *L : *OuterLoop) + if (L == InnerLoop) { + OuterLoop->removeChildLoop(L); return; } - } llvm_unreachable("Couldn't find loop"); } -void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop, - Loop *OuterLoop) { +/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the +/// new inner and outer loop after interchanging: NewInner is the original +/// outer loop and NewOuter is the original inner loop. +/// +/// Before interchanging, we have the following structure +/// Outer preheader +// Outer header +// Inner preheader +// Inner header +// Inner body +// Inner latch +// outer bbs +// Outer latch +// +// After interchanging: +// Inner preheader +// Inner header +// Outer preheader +// Outer header +// Inner body +// outer bbs +// Outer latch +// Inner latch +void LoopInterchangeTransform::restructureLoops( + Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader, + BasicBlock *OrigOuterPreHeader) { Loop *OuterLoopParent = OuterLoop->getParentLoop(); + // The original inner loop preheader moves from the new inner loop to + // the parent loop, if there is one. + NewInner->removeBlockFromLoop(OrigInnerPreHeader); + LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent); + + // Switch the loop levels. if (OuterLoopParent) { // Remove the loop from its parent loop. 
- removeChildLoop(OuterLoopParent, OuterLoop); - removeChildLoop(OuterLoop, InnerLoop); - OuterLoopParent->addChildLoop(InnerLoop); + removeChildLoop(OuterLoopParent, NewInner); + removeChildLoop(NewInner, NewOuter); + OuterLoopParent->addChildLoop(NewOuter); } else { - removeChildLoop(OuterLoop, InnerLoop); - LI->changeTopLevelLoop(OuterLoop, InnerLoop); + removeChildLoop(NewInner, NewOuter); + LI->changeTopLevelLoop(NewInner, NewOuter); + } + while (!NewOuter->empty()) + NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin())); + NewOuter->addChildLoop(NewInner); + + // BBs from the original inner loop. + SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks()); + + // Add BBs from the original outer loop to the original inner loop (excluding + // BBs already in inner loop) + for (BasicBlock *BB : NewInner->blocks()) + if (LI->getLoopFor(BB) == NewInner) + NewOuter->addBlockEntry(BB); + + // Now remove inner loop header and latch from the new inner loop and move + // other BBs (the loop body) to the new inner loop. + BasicBlock *OuterHeader = NewOuter->getHeader(); + BasicBlock *OuterLatch = NewOuter->getLoopLatch(); + for (BasicBlock *BB : OrigInnerBBs) { + // Nothing will change for BBs in child loops. + if (LI->getLoopFor(BB) != NewOuter) + continue; + // Remove the new outer loop header and latch from the new inner loop. + if (BB == OuterHeader || BB == OuterLatch) + NewInner->removeBlockFromLoop(BB); + else + LI->changeLoopFor(BB, NewInner); } - while (!InnerLoop->empty()) - OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin())); - - InnerLoop->addChildLoop(OuterLoop); + // The preheader of the original outer loop becomes part of the new + // outer loop. + NewOuter->addBlockEntry(OrigOuterPreHeader); + LI->changeLoopFor(OrigOuterPreHeader, NewOuter); } bool LoopInterchangeTransform::transform() { @@ -1173,10 +1266,10 @@ bool LoopInterchangeTransform::transform() { if (InnerLoop->getSubLoops().empty()) { BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); - DEBUG(dbgs() << "Calling Split Inner Loop\n"); + LLVM_DEBUG(dbgs() << "Calling Split Inner Loop\n"); PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); if (!InductionPHI) { - DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); + LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); return false; } @@ -1185,8 +1278,7 @@ bool LoopInterchangeTransform::transform() { else InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0)); - // Ensure that InductionPHI is the first Phi node as required by - // splitInnerLoopHeader + // Ensure that InductionPHI is the first Phi node. if (&InductionPHI->getParent()->front() != InductionPHI) InductionPHI->moveBefore(&InductionPHI->getParent()->front()); @@ -1194,20 +1286,20 @@ bool LoopInterchangeTransform::transform() { // incremented/decremented. // TODO: This splitting logic may not work always. Fix this. splitInnerLoopLatch(InnerIndexVar); - DEBUG(dbgs() << "splitInnerLoopLatch done\n"); + LLVM_DEBUG(dbgs() << "splitInnerLoopLatch done\n"); // Splits the inner loops phi nodes out into a separate basic block. 
- splitInnerLoopHeader(); - DEBUG(dbgs() << "splitInnerLoopHeader done\n"); + BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); + SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); + LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n"); } Transformed |= adjustLoopLinks(); if (!Transformed) { - DEBUG(dbgs() << "adjustLoopLinks failed\n"); + LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n"); return false; } - restructureLoops(InnerLoop, OuterLoop); return true; } @@ -1217,38 +1309,6 @@ void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) { InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI); } -void LoopInterchangeTransform::splitInnerLoopHeader() { - // Split the inner loop header out. Here make sure that the reduction PHI's - // stay in the innerloop body. - BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); - BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); - if (InnerLoopHasReduction) { - // Note: The induction PHI must be the first PHI for this to work - BasicBlock *New = InnerLoopHeader->splitBasicBlock( - ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split"); - if (LI) - if (Loop *L = LI->getLoopFor(InnerLoopHeader)) - L->addBasicBlockToLoop(New, *LI); - - // Adjust Reduction PHI's in the block. - SmallVector<PHINode *, 8> PHIVec; - for (auto I = New->begin(); isa<PHINode>(I); ++I) { - PHINode *PHI = dyn_cast<PHINode>(I); - Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader); - PHI->replaceAllUsesWith(V); - PHIVec.push_back((PHI)); - } - for (PHINode *P : PHIVec) { - P->eraseFromParent(); - } - } else { - SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); - } - - DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & " - "InnerLoopHeader\n"); -} - /// \brief Move all instructions except the terminator from FromBB right before /// InsertBefore static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { @@ -1262,18 +1322,40 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred, BasicBlock *NewPred) { - for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) { - PHINode *PHI = cast<PHINode>(I); - unsigned Num = PHI->getNumIncomingValues(); + for (PHINode &PHI : CurrBlock->phis()) { + unsigned Num = PHI.getNumIncomingValues(); for (unsigned i = 0; i < Num; ++i) { - if (PHI->getIncomingBlock(i) == OldPred) - PHI->setIncomingBlock(i, NewPred); + if (PHI.getIncomingBlock(i) == OldPred) + PHI.setIncomingBlock(i, NewPred); + } + } +} + +/// Update BI to jump to NewBB instead of OldBB. Records updates to +/// the dominator tree in DTUpdates, if DT should be preserved. 
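The helper defined below records one CFG edge change at a time; the batched-update pattern it feeds is sketched here, assuming a DominatorTree is at hand (a sketch, not the patch's code):

```cpp
// Minimal sketch of incremental dominator-tree maintenance: queue
// Insert/Delete edge updates while rewriting branches, then apply them
// in one batch. This is what lets the pass drop the old full
// DT->recalculate(F) after interchanging.
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include <vector>
using namespace llvm;

void retargetAndUpdate(BranchInst *BI, BasicBlock *OldBB,
                       BasicBlock *NewBB, DominatorTree &DT) {
  std::vector<DominatorTree::UpdateType> Updates;
  for (unsigned I = 0, E = BI->getNumSuccessors(); I != E; ++I)
    if (BI->getSuccessor(I) == OldBB) {
      BI->setSuccessor(I, NewBB);
      Updates.push_back({DominatorTree::UpdateKind::Insert,
                         BI->getParent(), NewBB});
      Updates.push_back({DominatorTree::UpdateKind::Delete,
                         BI->getParent(), OldBB});
    }
  DT.applyUpdates(Updates);   // one incremental batch per CFG surgery
}
```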
+static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, + BasicBlock *NewBB, + std::vector<DominatorTree::UpdateType> &DTUpdates) { + assert(llvm::count_if(BI->successors(), + [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 && + "BI must jump to OldBB at most once."); + for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; ++i) { + if (BI->getSuccessor(i) == OldBB) { + BI->setSuccessor(i, NewBB); + + DTUpdates.push_back( + {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); + DTUpdates.push_back( + {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); + break; } } } bool LoopInterchangeTransform::adjustLoopBranches() { - DEBUG(dbgs() << "adjustLoopBranches called\n"); + LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n"); + std::vector<DominatorTree::UpdateType> DTUpdates; + // Adjust the loop preheader BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); @@ -1313,27 +1395,18 @@ bool LoopInterchangeTransform::adjustLoopBranches() { return false; // Adjust Loop Preheader and headers - - unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors(); - for (unsigned i = 0; i < NumSucc; ++i) { - if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader) - OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader); - } - - NumSucc = OuterLoopHeaderBI->getNumSuccessors(); - for (unsigned i = 0; i < NumSucc; ++i) { - if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch) - OuterLoopHeaderBI->setSuccessor(i, LoopExit); - else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader) - OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor); - } + updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader, + InnerLoopPreHeader, DTUpdates); + updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates); + updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader, + InnerLoopHeaderSuccessor, DTUpdates); // Adjust reduction PHI's now that the incoming block has changed. updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader, OuterLoopHeader); - BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI); - InnerLoopHeaderBI->eraseFromParent(); + updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor, + OuterLoopPreHeader, DTUpdates); // -------------Adjust loop latches----------- if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader) @@ -1341,19 +1414,15 @@ bool LoopInterchangeTransform::adjustLoopBranches() { else InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0); - NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors(); - for (unsigned i = 0; i < NumSucc; ++i) { - if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch) - InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor); - } + updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch, + InnerLoopLatchSuccessor, DTUpdates); // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with // the value and remove this PHI node from inner loop. 
SmallVector<PHINode *, 8> LcssaVec; - for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) { - PHINode *LcssaPhi = cast<PHINode>(I); - LcssaVec.push_back(LcssaPhi); - } + for (PHINode &P : InnerLoopLatchSuccessor->phis()) + LcssaVec.push_back(&P); + for (PHINode *P : LcssaVec) { Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch); P->replaceAllUsesWith(Incoming);
@@ -1365,19 +1434,52 @@ else OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0); - if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor) - InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor); - else - InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor); + updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor, + OuterLoopLatchSuccessor, DTUpdates); + updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch, + DTUpdates); updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch); - if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) { - OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch); - } else { - OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch); + DT->applyUpdates(DTUpdates); + restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader, + OuterLoopPreHeader); + + // Now update the reduction PHIs in the inner and outer loop headers. + SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs; + for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1)) + InnerLoopPHIs.push_back(cast<PHINode>(&PHI)); + for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1)) + OuterLoopPHIs.push_back(cast<PHINode>(&PHI)); + + for (PHINode *PHI : OuterLoopPHIs) + PHI->moveBefore(InnerLoopHeader->getFirstNonPHI()); + + // Move the PHI nodes from the inner loop header to the outer loop header. + // We have to deal with one kind of PHI node: + // 1) PHI nodes that are part of inner loop-only reductions. + // We only have to move the PHI node and update the incoming blocks. + for (PHINode *PHI : InnerLoopPHIs) { + PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); + for (BasicBlock *InBB : PHI->blocks()) { + if (InnerLoop->contains(InBB)) + continue; + + assert(!isa<PHINode>(PHI->getIncomingValueForBlock(InBB)) && + "Unexpected incoming PHI node, reductions in outer loop are not " + "supported yet"); + PHI->replaceAllUsesWith(PHI->getIncomingValueForBlock(InBB)); + PHI->eraseFromParent(); + break; + } } // Update the incoming blocks for moved PHI nodes.
+ updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader); + updateIncomingBlock(OuterLoopHeader, InnerLoopLatch, OuterLoopLatch); + updateIncomingBlock(InnerLoopHeader, OuterLoopPreHeader, InnerLoopPreHeader); + updateIncomingBlock(InnerLoopHeader, OuterLoopLatch, InnerLoopLatch); + return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index dfa5ec1f354d..19bd9ebcc15b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -25,7 +25,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -52,6 +52,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include <algorithm> #include <cassert> @@ -79,7 +80,7 @@ STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE"); namespace { -/// \brief Represent a store-to-forwarding candidate. +/// Represent a store-to-forwarding candidate. struct StoreToLoadForwardingCandidate { LoadInst *Load; StoreInst *Store; @@ -87,7 +88,7 @@ struct StoreToLoadForwardingCandidate { StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store) : Load(Load), Store(Store) {} - /// \brief Return true if the dependence from the store to the load has a + /// Return true if the dependence from the store to the load has a /// distance of one. E.g. A[i+1] = A[i] bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L) const { @@ -136,7 +137,7 @@ struct StoreToLoadForwardingCandidate { } // end anonymous namespace -/// \brief Check if the store dominates all latches, so as long as there is no +/// Check if the store dominates all latches, so as long as there is no /// intervening store this value will be loaded in the next iteration. static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, DominatorTree *DT) { @@ -147,21 +148,21 @@ static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, }); } -/// \brief Return true if the load is not executed on all paths in the loop. +/// Return true if the load is not executed on all paths in the loop. static bool isLoadConditional(LoadInst *Load, Loop *L) { return Load->getParent() != L->getHeader(); } namespace { -/// \brief The per-loop class that does most of the work. +/// The per-loop class that does most of the work. class LoadEliminationForLoop { public: LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, DominatorTree *DT) : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {} - /// \brief Look through the loop-carried and loop-independent dependences in + /// Look through the loop-carried and loop-independent dependences in /// this loop and find store->load dependences. /// /// Note that no candidate is returned if LAA has failed to analyze the loop @@ -178,7 +179,7 @@ public: // forward and backward dependences qualify. Disqualify loads that have // other unknown dependences. 
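Before the implementation details, the idiom this pass targets, as a hedged source-level sketch (names and bounds illustrative):

```cpp
// Illustrative only. The load of A[i] reads what the previous iteration
// stored (a dependence distance of one), so LLE can carry the stored
// value across the backedge in a PHI and delete the load, guarded by
// runtime checks that B and C do not alias A.
void forwardAcrossBackedge(int N, int *A, const int *B, int *C) {
  for (int i = 0; i < N; ++i) {
    A[i + 1] = B[i];          // store in iteration i
    C[i] = A[i] * 2;          // reads the store from iteration i - 1
  }
}
```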
- SmallSet<Instruction *, 4> LoadsWithUnknownDepedence; + SmallPtrSet<Instruction *, 4> LoadsWithUnknownDepedence; for (const auto &Dep : *Deps) { Instruction *Source = Dep.getSource(LAI);
@@ -222,14 +223,14 @@ return Candidates; } - /// \brief Return the index of the instruction according to program order. + /// Return the index of the instruction according to program order. unsigned getInstrIndex(Instruction *Inst) { auto I = InstOrder.find(Inst); assert(I != InstOrder.end() && "No index for instruction"); return I->second; } - /// \brief If a load has multiple candidates associated (i.e. different + /// If a load has multiple candidates associated (i.e. different /// stores), it means that it could be forwarding from multiple stores /// depending on control flow. Remove these candidates. ///
@@ -284,22 +285,24 @@ Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) { if (LoadToSingleCand[Cand.Load] != &Cand) { - DEBUG(dbgs() << "Removing from candidates: \n" << Cand - << " The load may have multiple stores forwarding to " - << "it\n"); + LLVM_DEBUG( dbgs() << "Removing from candidates: \n" << Cand << " The load may have multiple stores forwarding to " << "it\n"); return true; } return false; }); } - /// \brief Given two pointers operations by their RuntimePointerChecking + /// Given two pointer operations by their RuntimePointerChecking /// indices, return true if they require an alias check. /// /// We need a check if one is a pointer for a candidate load and the other is /// a pointer for a possibly intervening store. bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2, - const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath, + const SmallPtrSet<Value *, 4> &PtrsWrittenOnFwdingPath, const std::set<Value *> &CandLoadPtrs) { Value *Ptr1 = LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
@@ -309,11 +312,11 @@ (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1))); } - /// \brief Return pointers that are possibly written to on the path from a + /// Return pointers that are possibly written to on the path from a /// forwarding store to a load. /// /// These pointers need to be alias-checked against the forwarding candidates. - SmallSet<Value *, 4> findPointersWrittenOnForwardingPath( + SmallPtrSet<Value *, 4> findPointersWrittenOnForwardingPath( const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) { // From FirstStore to LastLoad neither of the elimination candidate loads // should overlap with any of the stores.
@@ -351,7 +354,7 @@ // We're looking for stores after the first forwarding store until the end // of the loop, then from the beginning of the loop until the last // forwarded-to load. Collect the pointer for the stores. - SmallSet<Value *, 4> PtrsWrittenOnFwdingPath; + SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath; auto InsertStorePtr = [&](Instruction *I) { if (auto *S = dyn_cast<StoreInst>(I))
@@ -366,16 +369,16 @@ return PtrsWrittenOnFwdingPath; } - /// \brief Determine the pointer alias checks to prove that there are no + /// Determine the pointer alias checks to prove that there are no /// intervening stores. SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks( const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) { - SmallSet<Value *, 4> PtrsWrittenOnFwdingPath = + SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath = findPointersWrittenOnForwardingPath(Candidates); // Collect the pointers of the candidate loads.
- // FIXME: SmallSet does not work with std::inserter. + // FIXME: SmallPtrSet does not work with std::inserter. std::set<Value *> CandLoadPtrs; transform(Candidates, std::inserter(CandLoadPtrs, CandLoadPtrs.begin()), @@ -394,13 +397,14 @@ public: return false; }); - DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n"); - DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + LLVM_DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() + << "):\n"); + LLVM_DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); return Checks; } - /// \brief Perform the transformation for a candidate. + /// Perform the transformation for a candidate. void propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand, SCEVExpander &SEE) { @@ -436,11 +440,11 @@ public: Cand.Load->replaceAllUsesWith(PHI); } - /// \brief Top-level driver for each loop: find store->load forwarding + /// Top-level driver for each loop: find store->load forwarding /// candidates, add run-time checks and perform transformation. bool processLoop() { - DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName() - << "\" checking " << *L << "\n"); + LLVM_DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName() + << "\" checking " << *L << "\n"); // Look for store-to-load forwarding cases across the // backedge. E.g.: @@ -479,7 +483,7 @@ public: SmallVector<StoreToLoadForwardingCandidate, 4> Candidates; unsigned NumForwarding = 0; for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) { - DEBUG(dbgs() << "Candidate " << Cand); + LLVM_DEBUG(dbgs() << "Candidate " << Cand); // Make sure that the stored values is available everywhere in the loop in // the next iteration. @@ -498,9 +502,10 @@ public: continue; ++NumForwarding; - DEBUG(dbgs() - << NumForwarding - << ". Valid store-to-load forwarding across the loop backedge\n"); + LLVM_DEBUG( + dbgs() + << NumForwarding + << ". Valid store-to-load forwarding across the loop backedge\n"); Candidates.push_back(Cand); } if (Candidates.empty()) @@ -513,25 +518,26 @@ public: // Too many checks are likely to outweigh the benefits of forwarding. if (Checks.size() > Candidates.size() * CheckPerElim) { - DEBUG(dbgs() << "Too many run-time checks needed.\n"); + LLVM_DEBUG(dbgs() << "Too many run-time checks needed.\n"); return false; } if (LAI.getPSE().getUnionPredicate().getComplexity() > LoadElimSCEVCheckThreshold) { - DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); + LLVM_DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); return false; } if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) { if (L->getHeader()->getParent()->optForSize()) { - DEBUG(dbgs() << "Versioning is needed but not allowed when optimizing " - "for size.\n"); + LLVM_DEBUG( + dbgs() << "Versioning is needed but not allowed when optimizing " + "for size.\n"); return false; } if (!L->isLoopSimplifyForm()) { - DEBUG(dbgs() << "Loop is not is loop-simplify form"); + LLVM_DEBUG(dbgs() << "Loop is not is loop-simplify form"); return false; } @@ -558,7 +564,7 @@ public: private: Loop *L; - /// \brief Maps the load/store instructions to their index according to + /// Maps the load/store instructions to their index according to /// program order. DenseMap<Instruction *, unsigned> InstOrder; @@ -599,7 +605,7 @@ eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, namespace { -/// \brief The pass. Most of the work is delegated to the per-loop +/// The pass. 
Most of the work is delegated to the per-loop /// LoadEliminationForLoop class. class LoopLoadElimination : public FunctionPass { public: diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp index 2e4c7b19e476..561ceea1d880 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -155,7 +155,7 @@ // When S = -1 (i.e. reverse iterating loop), the transformation is supported // when: // * The loop has a single latch with the condition of the form: -// B(X) = X <pred> latchLimit, where <pred> is u> or s>. +// B(X) = X <pred> latchLimit, where <pred> is u>, u>=, s>, or s>=. // * The guard condition is of the form // G(X) = X - 1 u< guardLimit // @@ -171,9 +171,14 @@ // guardStart u< guardLimit && latchLimit u>= 1. // Similarly for sgt condition the widened condition is: // guardStart u< guardLimit && latchLimit s>= 1. +// For uge condition the widened condition is: +// guardStart u< guardLimit && latchLimit u> 1. +// For sge condition the widened condition is: +// guardStart u< guardLimit && latchLimit s> 1. //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopPredication.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -198,6 +203,20 @@ static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation", static cl::opt<bool> EnableCountDownLoop("loop-predication-enable-count-down-loop", cl::Hidden, cl::init(true)); + +static cl::opt<bool> + SkipProfitabilityChecks("loop-predication-skip-profitability-checks", + cl::Hidden, cl::init(false)); + +// This is the scale factor for the latch probability. We use this during +// profitability analysis to find other exiting blocks that have a much higher +// probability of exiting the loop instead of loop exiting via latch. +// This value should be greater than 1 for a sane profitability check. +static cl::opt<float> LatchExitProbabilityScale( + "loop-predication-latch-probability-scale", cl::Hidden, cl::init(2.0), + cl::desc("scale factor for the latch probability. Value should be greater " + "than 1. Lower values are ignored")); + namespace { class LoopPredication { /// Represents an induction variable check: @@ -217,6 +236,7 @@ class LoopPredication { }; ScalarEvolution *SE; + BranchProbabilityInfo *BPI; Loop *L; const DataLayout *DL; @@ -250,6 +270,12 @@ class LoopPredication { IRBuilder<> &Builder); bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander); + // If the loop always exits through another block in the loop, we should not + // predicate based on the latch check. For example, the latch check can be a + // very coarse grained check and there can be more fine grained exit checks + // within the loop. We identify such unprofitable loops through BPI. + bool isLoopProfitableToPredicate(); + // When the IV type is wider than the range operand type, we can still do loop // predication, by generating SCEVs for the range and latch that are of the // same type. We achieve this by generating a SCEV truncate expression for the @@ -266,8 +292,10 @@ class LoopPredication { // Return the loopLatchCheck corresponding to the RangeCheckType if safe to do // so. 
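The four reverse-loop latch predicates enumerated in the updated header comment collapse to one rule: the limit check against 1 reuses the latch predicate with its strictness flipped, which is what the patch later obtains from ICmpInst::getFlippedStrictnessPredicate. A sketch of the mapping:

  // For a reverse loop with latch condition  X <pred> latchLimit, the widened
  // guard is  guardStart u< guardLimit && latchLimit <q> 1,  where <q> is
  // <pred> with its strictness flipped.
  enum class Pred { UGT, UGE, SGT, SGE };
  static Pred flipStrictness(Pred P) {
    switch (P) {
    case Pred::UGT: return Pred::UGE; // latch u>  => check latchLimit u>= 1
    case Pred::UGE: return Pred::UGT; // latch u>= => check latchLimit u>  1
    case Pred::SGT: return Pred::SGE; // latch s>  => check latchLimit s>= 1
    case Pred::SGE: return Pred::SGT; // latch s>= => check latchLimit s>  1
    }
    return P; // unreachable for the four supported predicates
  }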
Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType); + public: - LoopPredication(ScalarEvolution *SE) : SE(SE){}; + LoopPredication(ScalarEvolution *SE, BranchProbabilityInfo *BPI) + : SE(SE), BPI(BPI){}; bool runOnLoop(Loop *L); }; @@ -279,6 +307,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<BranchProbabilityInfoWrapperPass>(); getLoopAnalysisUsage(AU); } @@ -286,7 +315,9 @@ public: if (skipLoop(L)) return false; auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - LoopPredication LP(SE); + BranchProbabilityInfo &BPI = + getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); + LoopPredication LP(SE, &BPI); return LP.runOnLoop(L); } }; @@ -296,6 +327,7 @@ char LoopPredicationLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication", "Loop predication", false, false) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication", "Loop predication", false, false) @@ -307,7 +339,11 @@ Pass *llvm::createLoopPredicationPass() { PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U) { - LoopPredication LP(&AR.SE); + const auto &FAM = + AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); + Function *F = L.getHeader()->getParent(); + auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F); + LoopPredication LP(&AR.SE, BPI); if (!LP.runOnLoop(&L)) return PreservedAnalyses::all(); @@ -375,11 +411,11 @@ LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) { if (!NewLatchCheck.IV) return None; NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType); - DEBUG(dbgs() << "IV of type: " << *LatchType - << "can be represented as range check type:" << *RangeCheckType - << "\n"); - DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n"); - DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n"); + LLVM_DEBUG(dbgs() << "IV of type: " << *LatchType + << "can be represented as range check type:" + << *RangeCheckType << "\n"); + LLVM_DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n"); + LLVM_DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n"); return NewLatchCheck; } @@ -412,30 +448,15 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop( SE->getMinusSCEV(LatchStart, SE->getOne(Ty))); if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) || !CanExpand(LatchLimit) || !CanExpand(RHS)) { - DEBUG(dbgs() << "Can't expand limit check!\n"); + LLVM_DEBUG(dbgs() << "Can't expand limit check!\n"); return None; } - ICmpInst::Predicate LimitCheckPred; - switch (LatchCheck.Pred) { - case ICmpInst::ICMP_ULT: - LimitCheckPred = ICmpInst::ICMP_ULE; - break; - case ICmpInst::ICMP_ULE: - LimitCheckPred = ICmpInst::ICMP_ULT; - break; - case ICmpInst::ICMP_SLT: - LimitCheckPred = ICmpInst::ICMP_SLE; - break; - case ICmpInst::ICMP_SLE: - LimitCheckPred = ICmpInst::ICMP_SLT; - break; - default: - llvm_unreachable("Unsupported loop latch!"); - } + auto LimitCheckPred = + ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred); - DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n"); - DEBUG(dbgs() << "RHS: " << *RHS << "\n"); - DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n"); + LLVM_DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n"); + LLVM_DEBUG(dbgs() << "RHS: " << *RHS << "\n"); + LLVM_DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n"); 
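Both halves of the widened condition are materialized the same way; a condensed sketch of an expandCheck-style helper (assuming the pass's SCEVExpander, IRBuilder, and preheader insertion point; names are illustrative):

  #include "llvm/Analysis/ScalarEvolutionExpander.h"
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Expand both SCEV operands to IR at the insertion point, then emit the
  // comparison there; the caller ANDs the individual checks together.
  static Value *expandCheckSketch(SCEVExpander &Expander, IRBuilder<> &Builder,
                                  ICmpInst::Predicate Pred, const SCEV *LHS,
                                  const SCEV *RHS, Instruction *InsertAt) {
    Value *L = Expander.expandCodeFor(LHS, LHS->getType(), InsertAt);
    Value *R = Expander.expandCodeFor(RHS, RHS->getType(), InsertAt);
    Builder.SetInsertPoint(InsertAt);
    return Builder.CreateICmp(Pred, L, R, "widened.cond");
  }
  // e.g.  Value *Guard = Builder.CreateAnd(FirstIterationCheck, LimitCheck);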
Instruction *InsertAt = Preheader->getTerminator(); auto *LimitCheck = @@ -454,16 +475,16 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop( const SCEV *LatchLimit = LatchCheck.Limit; if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) || !CanExpand(LatchLimit)) { - DEBUG(dbgs() << "Can't expand limit check!\n"); + LLVM_DEBUG(dbgs() << "Can't expand limit check!\n"); return None; } // The decrement of the latch check IV should be the same as the // rangeCheckIV. auto *PostDecLatchCheckIV = LatchCheck.IV->getPostIncExpr(*SE); if (RangeCheck.IV != PostDecLatchCheckIV) { - DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: " - << *PostDecLatchCheckIV - << " and RangeCheckIV: " << *RangeCheck.IV << "\n"); + LLVM_DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: " + << *PostDecLatchCheckIV + << " and RangeCheckIV: " << *RangeCheck.IV << "\n"); return None; } @@ -472,9 +493,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop( // latchLimit <pred> 1. // See the header comment for reasoning of the checks. Instruction *InsertAt = Preheader->getTerminator(); - auto LimitCheckPred = ICmpInst::isSigned(LatchCheck.Pred) - ? ICmpInst::ICMP_SGE - : ICmpInst::ICMP_UGE; + auto LimitCheckPred = + ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred); auto *FirstIterationCheck = expandCheck(Expander, Builder, ICmpInst::ICMP_ULT, GuardStart, GuardLimit, InsertAt); auto *LimitCheck = expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, @@ -488,8 +508,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop( Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander, IRBuilder<> &Builder) { - DEBUG(dbgs() << "Analyzing ICmpInst condition:\n"); - DEBUG(ICI->dump()); + LLVM_DEBUG(dbgs() << "Analyzing ICmpInst condition:\n"); + LLVM_DEBUG(ICI->dump()); // parseLoopStructure guarantees that the latch condition is: // ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=. @@ -497,34 +517,34 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI, // i u< guardLimit auto RangeCheck = parseLoopICmp(ICI); if (!RangeCheck) { - DEBUG(dbgs() << "Failed to parse the loop latch condition!\n"); + LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n"); return None; } - DEBUG(dbgs() << "Guard check:\n"); - DEBUG(RangeCheck->dump()); + LLVM_DEBUG(dbgs() << "Guard check:\n"); + LLVM_DEBUG(RangeCheck->dump()); if (RangeCheck->Pred != ICmpInst::ICMP_ULT) { - DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred - << ")!\n"); + LLVM_DEBUG(dbgs() << "Unsupported range check predicate(" + << RangeCheck->Pred << ")!\n"); return None; } auto *RangeCheckIV = RangeCheck->IV; if (!RangeCheckIV->isAffine()) { - DEBUG(dbgs() << "Range check IV is not affine!\n"); + LLVM_DEBUG(dbgs() << "Range check IV is not affine!\n"); return None; } auto *Step = RangeCheckIV->getStepRecurrence(*SE); // We cannot just compare with latch IV step because the latch and range IVs // may have different types. 
if (!isSupportedStep(Step)) { - DEBUG(dbgs() << "Range check and latch have IVs different steps!\n"); + LLVM_DEBUG(dbgs() << "Range check and latch have IVs different steps!\n"); return None; } auto *Ty = RangeCheckIV->getType(); auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty); if (!CurrLatchCheckOpt) { - DEBUG(dbgs() << "Failed to generate a loop latch check " - "corresponding to range type: " - << *Ty << "\n"); + LLVM_DEBUG(dbgs() << "Failed to generate a loop latch check " + "corresponding to range type: " + << *Ty << "\n"); return None; } @@ -535,7 +555,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI, CurrLatchCheck.IV->getStepRecurrence(*SE)->getType() && "Range and latch steps should be of same type!"); if (Step != CurrLatchCheck.IV->getStepRecurrence(*SE)) { - DEBUG(dbgs() << "Range and latch have different step values!\n"); + LLVM_DEBUG(dbgs() << "Range and latch have different step values!\n"); return None; } @@ -551,14 +571,14 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI, bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard, SCEVExpander &Expander) { - DEBUG(dbgs() << "Processing guard:\n"); - DEBUG(Guard->dump()); + LLVM_DEBUG(dbgs() << "Processing guard:\n"); + LLVM_DEBUG(Guard->dump()); IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator())); // The guard condition is expected to be in form of: // cond1 && cond2 && cond3 ... - // Iterate over subconditions looking for for icmp conditions which can be + // Iterate over subconditions looking for icmp conditions which can be // widened across loop iterations. Widening these conditions remember the // resulting list of subconditions in Checks vector. SmallVector<Value *, 4> Worklist(1, Guard->getOperand(0)); @@ -605,7 +625,7 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard, LastCheck = Builder.CreateAnd(LastCheck, Check); Guard->setOperand(0, LastCheck); - DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n"); + LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n"); return true; } @@ -614,7 +634,7 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() { BasicBlock *LoopLatch = L->getLoopLatch(); if (!LoopLatch) { - DEBUG(dbgs() << "The loop doesn't have a single latch!\n"); + LLVM_DEBUG(dbgs() << "The loop doesn't have a single latch!\n"); return None; } @@ -625,7 +645,7 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() { if (!match(LoopLatch->getTerminator(), m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), TrueDest, FalseDest))) { - DEBUG(dbgs() << "Failed to match the latch terminator!\n"); + LLVM_DEBUG(dbgs() << "Failed to match the latch terminator!\n"); return None; } assert((TrueDest == L->getHeader() || FalseDest == L->getHeader()) && @@ -635,20 +655,20 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() { auto Result = parseLoopICmp(Pred, LHS, RHS); if (!Result) { - DEBUG(dbgs() << "Failed to parse the loop latch condition!\n"); + LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n"); return None; } // Check affine first, so if it's not we don't try to compute the step // recurrence. 
if (!Result->IV->isAffine()) { - DEBUG(dbgs() << "The induction variable is not affine!\n"); + LLVM_DEBUG(dbgs() << "The induction variable is not affine!\n"); return None; } auto *Step = Result->IV->getStepRecurrence(*SE); if (!isSupportedStep(Step)) { - DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n"); + LLVM_DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n"); return None; } @@ -658,13 +678,14 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() { Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE; } else { assert(Step->isAllOnesValue() && "Step should be -1!"); - return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT; + return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT && + Pred != ICmpInst::ICMP_UGE && Pred != ICmpInst::ICMP_SGE; } }; if (IsUnsupportedPredicate(Step, Result->Pred)) { - DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred - << ")!\n"); + LLVM_DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred + << ")!\n"); return None; } return Result; @@ -700,11 +721,65 @@ bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) { Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize; } +bool LoopPredication::isLoopProfitableToPredicate() { + if (SkipProfitabilityChecks || !BPI) + return true; + + SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 8> ExitEdges; + L->getExitEdges(ExitEdges); + // If there is only one exiting edge in the loop, it is always profitable to + // predicate the loop. + if (ExitEdges.size() == 1) + return true; + + // Calculate the exiting probabilities of all exiting edges from the loop, + // starting with the LatchExitProbability. + // Heuristic for profitability: If any of the exiting blocks' probability of + // exiting the loop is larger than exiting through the latch block, it's not + // profitable to predicate the loop. + auto *LatchBlock = L->getLoopLatch(); + assert(LatchBlock && "Should have a single latch at this point!"); + auto *LatchTerm = LatchBlock->getTerminator(); + assert(LatchTerm->getNumSuccessors() == 2 && + "expected to be an exiting block with 2 succs!"); + unsigned LatchBrExitIdx = + LatchTerm->getSuccessor(0) == L->getHeader() ? 1 : 0; + BranchProbability LatchExitProbability = + BPI->getEdgeProbability(LatchBlock, LatchBrExitIdx); + + // Protect against degenerate inputs provided by the user. Providing a value + // less than one, can invert the definition of profitable loop predication. + float ScaleFactor = LatchExitProbabilityScale; + if (ScaleFactor < 1) { + LLVM_DEBUG( + dbgs() + << "Ignored user setting for loop-predication-latch-probability-scale: " + << LatchExitProbabilityScale << "\n"); + LLVM_DEBUG(dbgs() << "The value is set to 1.0\n"); + ScaleFactor = 1.0; + } + const auto LatchProbabilityThreshold = + LatchExitProbability * ScaleFactor; + + for (const auto &ExitEdge : ExitEdges) { + BranchProbability ExitingBlockProbability = + BPI->getEdgeProbability(ExitEdge.first, ExitEdge.second); + // Some exiting edge has higher probability than the latch exiting edge. + // No longer profitable to predicate. + if (ExitingBlockProbability > LatchProbabilityThreshold) + return false; + } + // Using BPI, we have concluded that the most probable way to exit from the + // loop is through the latch (or there's no profile information and all + // exits are equally likely). 
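The profitability walk above scales the latch's exit probability by loop-predication-latch-probability-scale (default 2.0) and rejects the loop if any other exit edge beats that threshold. A self-contained model, with plain doubles standing in for llvm::BranchProbability:

  #include <vector>
  // Returns false when some non-latch exit is more likely (beyond the
  // scale-factor slack) than exiting through the latch check itself.
  static bool profitableToPredicate(double LatchExitProb,
                                    const std::vector<double> &OtherExitProbs,
                                    double ScaleFactor) {
    if (ScaleFactor < 1.0)
      ScaleFactor = 1.0;                  // degenerate user input is clamped
    const double Threshold = LatchExitProb * ScaleFactor;
    for (double P : OtherExitProbs)
      if (P > Threshold)
        return false;                     // another exit dominates the latch
    return true;
  }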
+ return true; +} + bool LoopPredication::runOnLoop(Loop *Loop) { L = Loop; - DEBUG(dbgs() << "Analyzing "); - DEBUG(L->dump()); + LLVM_DEBUG(dbgs() << "Analyzing "); + LLVM_DEBUG(L->dump()); Module *M = L->getHeader()->getModule(); @@ -725,9 +800,13 @@ bool LoopPredication::runOnLoop(Loop *Loop) { return false; LatchCheck = *LatchCheckOpt; - DEBUG(dbgs() << "Latch check:\n"); - DEBUG(LatchCheck.dump()); + LLVM_DEBUG(dbgs() << "Latch check:\n"); + LLVM_DEBUG(LatchCheck.dump()); + if (!isLoopProfitableToPredicate()) { + LLVM_DEBUG(dbgs() << "Loop not profitable to predicate!\n"); + return false; + } // Collect all the guards into a vector and process later, so as not // to invalidate the instruction iterator. SmallVector<IntrinsicInst *, 4> Guards; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index d1a54b877950..9a99e5925572 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -17,7 +17,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -28,6 +28,7 @@ #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -51,8 +52,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <cassert> #include <cstddef> @@ -69,10 +70,6 @@ using namespace llvm; STATISTIC(NumRerolledLoops, "Number of rerolled loops"); static cl::opt<unsigned> -MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, - cl::desc("The maximum increment for loop rerolling")); - -static cl::opt<unsigned> NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400), cl::Hidden, cl::desc("The maximum number of failures to tolerate" @@ -188,7 +185,7 @@ namespace { bool PreserveLCSSA; using SmallInstructionVector = SmallVector<Instruction *, 16>; - using SmallInstructionSet = SmallSet<Instruction *, 16>; + using SmallInstructionSet = SmallPtrSet<Instruction *, 16>; // Map between induction variable and its increment DenseMap<Instruction *, int64_t> IVToIncMap; @@ -397,8 +394,8 @@ namespace { /// Stage 3: Assuming validate() returned true, perform the /// replacement. - /// @param IterCount The maximum iteration count of L. - void replace(const SCEV *IterCount); + /// @param BackedgeTakenCount The backedge-taken count of L. 
+ void replace(const SCEV *BackedgeTakenCount); protected: using UsesTy = MapVector<Instruction *, BitVector>; @@ -428,8 +425,7 @@ namespace { bool instrDependsOn(Instruction *I, UsesTy::iterator Start, UsesTy::iterator End); - void replaceIV(Instruction *Inst, Instruction *IV, const SCEV *IterCount); - void updateNonLoopCtrlIncr(); + void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr); LoopReroll *Parent; @@ -482,8 +478,8 @@ namespace { void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); void collectPossibleReductions(Loop *L, ReductionTracker &Reductions); - bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, - ReductionTracker &Reductions); + bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, + const SCEV *BackedgeTakenCount, ReductionTracker &Reductions); }; } // end anonymous namespace @@ -510,48 +506,6 @@ static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { return false; } -static const SCEVConstant *getIncrmentFactorSCEV(ScalarEvolution *SE, - const SCEV *SCEVExpr, - Instruction &IV) { - const SCEVMulExpr *MulSCEV = dyn_cast<SCEVMulExpr>(SCEVExpr); - - // If StepRecurrence of a SCEVExpr is a constant (c1 * c2, c2 = sizeof(ptr)), - // Return c1. - if (!MulSCEV && IV.getType()->isPointerTy()) - if (const SCEVConstant *IncSCEV = dyn_cast<SCEVConstant>(SCEVExpr)) { - const PointerType *PTy = cast<PointerType>(IV.getType()); - Type *ElTy = PTy->getElementType(); - const SCEV *SizeOfExpr = - SE->getSizeOfExpr(SE->getEffectiveSCEVType(IV.getType()), ElTy); - if (IncSCEV->getValue()->getValue().isNegative()) { - const SCEV *NewSCEV = - SE->getUDivExpr(SE->getNegativeSCEV(SCEVExpr), SizeOfExpr); - return dyn_cast<SCEVConstant>(SE->getNegativeSCEV(NewSCEV)); - } else { - return dyn_cast<SCEVConstant>(SE->getUDivExpr(SCEVExpr, SizeOfExpr)); - } - } - - if (!MulSCEV) - return nullptr; - - // If StepRecurrence of a SCEVExpr is a c * sizeof(x), where c is constant, - // Return c. - const SCEVConstant *CIncSCEV = nullptr; - for (const SCEV *Operand : MulSCEV->operands()) { - if (const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Operand)) { - CIncSCEV = Constant; - } else if (const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Operand)) { - Type *AllocTy; - if (!Unknown->isSizeOf(AllocTy)) - break; - } else { - return nullptr; - } - } - return CIncSCEV; -} - // Check if an IV is only used to control the loop. There are two cases: // 1. 
It only has one use which is loop increment, and the increment is only // used by comparison and the PHI (could has sext with nsw in between), and the @@ -632,25 +586,17 @@ void LoopReroll::collectPossibleIVs(Loop *L, continue; if (!PHISCEV->isAffine()) continue; - const SCEVConstant *IncSCEV = nullptr; - if (I->getType()->isPointerTy()) - IncSCEV = - getIncrmentFactorSCEV(SE, PHISCEV->getStepRecurrence(*SE), *I); - else - IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE)); + auto IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE)); if (IncSCEV) { - const APInt &AInt = IncSCEV->getValue()->getValue().abs(); - if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc)) - continue; IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue(); - DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV - << "\n"); + LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV + << "\n"); if (isLoopControlIV(L, &*I)) { assert(!LoopControlIV && "Found two loop control only IV"); LoopControlIV = &(*I); - DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I << " = " - << *PHISCEV << "\n"); + LLVM_DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I + << " = " << *PHISCEV << "\n"); } else PossibleIVs.push_back(&*I); } @@ -717,8 +663,8 @@ void LoopReroll::collectPossibleReductions(Loop *L, if (!SLR.valid()) continue; - DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " << - SLR.size() << " chained instructions)\n"); + LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " + << SLR.size() << " chained instructions)\n"); Reductions.addSLR(SLR); } } @@ -856,7 +802,8 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { BaseUsers.push_back(II); continue; } else { - DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n"); + LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I + << "\n"); return false; } } @@ -878,7 +825,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { // away. if (BaseUsers.size()) { if (Roots.find(0) != Roots.end()) { - DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n"); + LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n"); return false; } Roots[0] = Base; @@ -894,9 +841,9 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { if (KV.first == 0) continue; if (!KV.second->hasNUses(NumBaseUses)) { - DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: " - << "#Base=" << NumBaseUses << ", #Root=" << - KV.second->getNumUses() << "\n"); + LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: " + << "#Base=" << NumBaseUses + << ", #Root=" << KV.second->getNumUses() << "\n"); return false; } } @@ -1024,13 +971,14 @@ bool LoopReroll::DAGRootTracker::findRoots() { // Ensure all sets have the same size. 
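With the sizeof-based pointer handling and the max-reroll-increment cap gone, the IV scan above reduces to: a header PHI is a possible IV when ScalarEvolution models it as an affine add-recurrence of this loop with a constant step. A sketch of that reduced test (the driver loop and surrounding names are illustrative):

  for (PHINode &PN : Header->phis()) {
    if (!SE->isSCEVable(PN.getType()))
      continue;
    auto *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&PN));
    if (!AR || AR->getLoop() != L || !AR->isAffine())
      continue;
    if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
      IVToIncMap[&PN] = Step->getValue()->getSExtValue(); // record increment
  }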
if (RootSets.empty()) { - DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n"); + LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n"); return false; } for (auto &V : RootSets) { if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) { - DEBUG(dbgs() - << "LRR: Aborting because not all root sets have the same size\n"); + LLVM_DEBUG( + dbgs() + << "LRR: Aborting because not all root sets have the same size\n"); return false; } } @@ -1038,13 +986,14 @@ bool LoopReroll::DAGRootTracker::findRoots() { Scale = RootSets[0].Roots.size() + 1; if (Scale > IL_MaxRerollIterations) { - DEBUG(dbgs() << "LRR: Aborting - too many iterations found. " - << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations - << "\n"); + LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. " + << "#Found=" << Scale + << ", #Max=" << IL_MaxRerollIterations << "\n"); return false; } - DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n"); + LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale + << "\n"); return true; } @@ -1078,7 +1027,7 @@ bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &Po // While we're here, check the use sets are the same size. if (V.size() != VBase.size()) { - DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n"); + LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n"); return false; } @@ -1235,17 +1184,17 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // set. for (auto &KV : Uses) { if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) { - DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " - << *KV.first << " (#uses=" << KV.second.count() << ")\n"); + LLVM_DEBUG( + dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " + << *KV.first << " (#uses=" << KV.second.count() << ")\n"); return false; } } - DEBUG( - for (auto &KV : Uses) { - dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n"; - } - ); + LLVM_DEBUG(for (auto &KV + : Uses) { + dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n"; + }); for (unsigned Iter = 1; Iter < Scale; ++Iter) { // In addition to regular aliasing information, we need to look for @@ -1304,8 +1253,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { if (TryIt == Uses.end() || TryIt == RootIt || instrDependsOn(TryIt->first, RootIt, TryIt)) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << - " vs. " << *RootInst << "\n"); + LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " + << *BaseInst << " vs. " << *RootInst << "\n"); return false; } @@ -1341,8 +1290,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // root instruction, does not also belong to the base set or the set of // some other root instruction. if (RootIt->second.count() > 1) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << - " vs. " << *RootInst << " (prev. case overlap)\n"); + LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst + << " vs. " << *RootInst << " (prev. case overlap)\n"); return false; } @@ -1352,8 +1301,9 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { if (RootInst->mayReadFromMemory()) for (auto &K : AST) { if (K.aliasesUnknownInst(RootInst, *AA)) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << - " vs. 
" << *RootInst << " (depends on future store)\n"); + LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " + << *BaseInst << " vs. " << *RootInst + << " (depends on future store)\n"); return false; } } @@ -1366,9 +1316,9 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { !isSafeToSpeculativelyExecute(BaseInst)) || (!isUnorderedLoadStore(RootInst) && !isSafeToSpeculativelyExecute(RootInst)))) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << - " vs. " << *RootInst << - " (side effects prevent reordering)\n"); + LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst + << " vs. " << *RootInst + << " (side effects prevent reordering)\n"); return false; } @@ -1419,8 +1369,9 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { BaseInst->getOperand(!j) == Op2) { Swapped = true; } else { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst - << " vs. " << *RootInst << " (operand " << j << ")\n"); + LLVM_DEBUG(dbgs() + << "LRR: iteration root match failed at " << *BaseInst + << " vs. " << *RootInst << " (operand " << j << ")\n"); return false; } } @@ -1433,8 +1384,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { hasUsesOutsideLoop(BaseInst, L)) || (!PossibleRedLastSet.count(RootInst) && hasUsesOutsideLoop(RootInst, L))) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << - " vs. " << *RootInst << " (uses outside loop)\n"); + LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst + << " vs. " << *RootInst << " (uses outside loop)\n"); return false; } @@ -1451,20 +1402,32 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { "Mismatched set sizes!"); } - DEBUG(dbgs() << "LRR: Matched all iteration increments for " << - *IV << "\n"); + LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV + << "\n"); return true; } -void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { +void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) { BasicBlock *Header = L->getHeader(); + + // Compute the start and increment for each BaseInst before we start erasing + // instructions. + SmallVector<const SCEV *, 8> StartExprs; + SmallVector<const SCEV *, 8> IncrExprs; + for (auto &DRS : RootSets) { + const SCEVAddRecExpr *IVSCEV = + cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); + StartExprs.push_back(IVSCEV->getStart()); + IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV)); + } + // Remove instructions associated with non-base iterations. for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend(); J != JE;) { unsigned I = Uses[&*J].find_first(); if (I > 0 && I < IL_All) { - DEBUG(dbgs() << "LRR: removing: " << *J << "\n"); + LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n"); J++->eraseFromParent(); continue; } @@ -1472,74 +1435,47 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { ++J; } - bool HasTwoIVs = LoopControlIV && LoopControlIV != IV; + // Rewrite each BaseInst using SCEV. + for (size_t i = 0, e = RootSets.size(); i != e; ++i) + // Insert the new induction variable. + replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]); - if (HasTwoIVs) { - updateNonLoopCtrlIncr(); - replaceIV(LoopControlIV, LoopControlIV, IterCount); - } else - // We need to create a new induction variable for each different BaseInst. - for (auto &DRS : RootSets) - // Insert the new induction variable. 
- replaceIV(DRS.BaseInst, IV, IterCount); + { // Limit the lifetime of SCEVExpander. + BranchInst *BI = cast<BranchInst>(Header->getTerminator()); + const DataLayout &DL = Header->getModule()->getDataLayout(); + SCEVExpander Expander(*SE, DL, "reroll"); + auto Zero = SE->getZero(BackedgeTakenCount->getType()); + auto One = SE->getOne(BackedgeTakenCount->getType()); + auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap); + Value *NewIV = + Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(), + Header->getFirstNonPHIOrDbg()); + // FIXME: This arithmetic can overflow. + auto TripCount = SE->getAddExpr(BackedgeTakenCount, One); + auto ScaledTripCount = SE->getMulExpr( + TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale)); + auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One); + Value *TakenCount = + Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(), + Header->getFirstNonPHIOrDbg()); + Value *Cond = + new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond"); + BI->setCondition(Cond); + + if (BI->getSuccessor(1) != Header) + BI->swapSuccessors(); + } SimplifyInstructionsInBlock(Header, TLI); DeleteDeadPHIs(Header, TLI); } -// For non-loop-control IVs, we only need to update the last increment -// with right amount, then we are done. -void LoopReroll::DAGRootTracker::updateNonLoopCtrlIncr() { - const SCEV *NewInc = nullptr; - for (auto *LoopInc : LoopIncs) { - GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LoopInc); - const SCEVConstant *COp = nullptr; - if (GEP && LoopInc->getOperand(0)->getType()->isPointerTy()) { - COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1))); - } else { - COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(0))); - if (!COp) - COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1))); - } - - assert(COp && "Didn't find constant operand of LoopInc!\n"); - - const APInt &AInt = COp->getValue()->getValue(); - const SCEV *ScaleSCEV = SE->getConstant(COp->getType(), Scale); - if (AInt.isNegative()) { - NewInc = SE->getNegativeSCEV(COp); - NewInc = SE->getUDivExpr(NewInc, ScaleSCEV); - NewInc = SE->getNegativeSCEV(NewInc); - } else - NewInc = SE->getUDivExpr(COp, ScaleSCEV); - - LoopInc->setOperand(1, dyn_cast<SCEVConstant>(NewInc)->getValue()); - } -} - -void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst, - Instruction *InstIV, - const SCEV *IterCount) { +void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS, + const SCEV *Start, + const SCEV *IncrExpr) { BasicBlock *Header = L->getHeader(); - int64_t Inc = IVToIncMap[InstIV]; - bool NeedNewIV = InstIV == LoopControlIV; - bool Negative = !NeedNewIV && Inc < 0; - - const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(Inst)); - const SCEV *Start = RealIVSCEV->getStart(); - - if (NeedNewIV) - Start = SE->getConstant(Start->getType(), 0); - - const SCEV *SizeOfExpr = nullptr; - const SCEV *IncrExpr = - SE->getConstant(RealIVSCEV->getType(), Negative ? 
-1 : 1); - if (auto *PTy = dyn_cast<PointerType>(Inst->getType())) { - Type *ElTy = PTy->getElementType(); - SizeOfExpr = - SE->getSizeOfExpr(SE->getEffectiveSCEVType(Inst->getType()), ElTy); - IncrExpr = SE->getMulExpr(IncrExpr, SizeOfExpr); - } + Instruction *Inst = DRS.BaseInst; + const SCEV *NewIVSCEV = SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap); @@ -1552,54 +1488,6 @@ void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst, for (auto &KV : Uses) if (KV.second.find_first() == 0) KV.first->replaceUsesOfWith(Inst, NewIV); - - if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { - // FIXME: Why do we need this check? - if (Uses[BI].find_first() == IL_All) { - const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); - - if (NeedNewIV) - ICSCEV = SE->getMulExpr(IterCount, - SE->getConstant(IterCount->getType(), Scale)); - - // Iteration count SCEV minus or plus 1 - const SCEV *MinusPlus1SCEV = - SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1); - if (Inst->getType()->isPointerTy()) { - assert(SizeOfExpr && "SizeOfExpr is not initialized"); - MinusPlus1SCEV = SE->getMulExpr(MinusPlus1SCEV, SizeOfExpr); - } - - const SCEV *ICMinusPlus1SCEV = SE->getMinusSCEV(ICSCEV, MinusPlus1SCEV); - // Iteration count minus 1 - Instruction *InsertPtr = nullptr; - if (isa<SCEVConstant>(ICMinusPlus1SCEV)) { - InsertPtr = BI; - } else { - BasicBlock *Preheader = L->getLoopPreheader(); - if (!Preheader) - Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); - InsertPtr = Preheader->getTerminator(); - } - - if (!isa<PointerType>(NewIV->getType()) && NeedNewIV && - (SE->getTypeSizeInBits(NewIV->getType()) < - SE->getTypeSizeInBits(ICMinusPlus1SCEV->getType()))) { - IRBuilder<> Builder(BI); - Builder.SetCurrentDebugLocation(BI->getDebugLoc()); - NewIV = Builder.CreateSExt(NewIV, ICMinusPlus1SCEV->getType()); - } - Value *ICMinusPlus1 = Expander.expandCodeFor( - ICMinusPlus1SCEV, NewIV->getType(), InsertPtr); - - Value *Cond = - new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinusPlus1, "exitcond"); - BI->setCondition(Cond); - - if (BI->getSuccessor(1) != Header) - BI->swapSuccessors(); - } - } } } @@ -1617,17 +1505,17 @@ bool LoopReroll::ReductionTracker::validateSelected() { int Iter = PossibleRedIter[J]; if (Iter != PrevIter && Iter != PrevIter + 1 && !PossibleReds[i].getReducedValue()->isAssociative()) { - DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << - J << "\n"); + LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " + << J << "\n"); return false; } if (Iter != PrevIter) { if (Count != BaseCount) { - DEBUG(dbgs() << "LRR: Iteration " << PrevIter << - " reduction use count " << Count << - " is not equal to the base use count " << - BaseCount << "\n"); + LLVM_DEBUG(dbgs() + << "LRR: Iteration " << PrevIter << " reduction use count " + << Count << " is not equal to the base use count " + << BaseCount << "\n"); return false; } @@ -1716,15 +1604,15 @@ void LoopReroll::ReductionTracker::replaceSelected() { // f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions // have been validated), then we reroll the loop. 
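The rewritten replace() above derives the new exit bound as (BackedgeTakenCount + 1) * Scale - 1. A hypothetical source-level illustration with Scale = 2 makes the arithmetic concrete:

  // Before rerolling (n even): trips = n/2, backedge-taken count = n/2 - 1.
  void before(int *x, int n) {
    for (int i = 0; i < n; i += 2) {
      x[i] += 1;                  // base instruction of the root set
      x[i + 1] += 1;              // root for iteration offset 1
    }
  }
  // After rerolling with the new IV {0,+,1}:
  //   new backedge-taken count = (old BTC + 1) * Scale - 1
  //                            = (n/2) * 2 - 1 = n - 1,  i.e. n trips.
  void after(int *x, int n) {
    for (int i = 0; i < n; ++i)
      x[i] += 1;
  }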
bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, - const SCEV *IterCount, + const SCEV *BackedgeTakenCount, ReductionTracker &Reductions) { DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA, IVToIncMap, LoopControlIV); if (!DAGRoots.findRoots()) return false; - DEBUG(dbgs() << "LRR: Found all root induction increments for: " << - *IV << "\n"); + LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV + << "\n"); if (!DAGRoots.validate(Reductions)) return false; @@ -1734,7 +1622,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, // making changes! Reductions.replaceSelected(); - DAGRoots.replace(IterCount); + DAGRoots.replace(BackedgeTakenCount); ++NumRerolledLoops; return true; @@ -1752,9 +1640,9 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); BasicBlock *Header = L->getHeader(); - DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << - "] Loop %" << Header->getName() << " (" << - L->getNumBlocks() << " block(s))\n"); + LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %" + << Header->getName() << " (" << L->getNumBlocks() + << " block(s))\n"); // For now, we'll handle only single BB loops. if (L->getNumBlocks() > 1) @@ -1763,10 +1651,10 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (!SE->hasLoopInvariantBackedgeTakenCount(L)) return false; - const SCEV *LIBETC = SE->getBackedgeTakenCount(L); - const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType())); - DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n"); - DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); + LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n"); + LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount + << "\n"); // First, we need to find the induction variable with respect to which we can // reroll (there may be several possible options). @@ -1776,7 +1664,7 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { collectPossibleIVs(L, PossibleIVs); if (PossibleIVs.empty()) { - DEBUG(dbgs() << "LRR: No possible IVs found\n"); + LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n"); return false; } @@ -1787,11 +1675,11 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { // For each possible IV, collect the associated possible set of 'root' nodes // (i+1, i+2, etc.). for (Instruction *PossibleIV : PossibleIVs) - if (reroll(PossibleIV, L, Header, IterCount, Reductions)) { + if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) { Changed = true; break; } - DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n"); + LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n"); // Trip count of L has changed so SE must be re-evaluated. 
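The hunk ends on the comment that SE must be re-evaluated; what that means in practice (a sketch, not part of the shown context) is dropping ScalarEvolution's cached results for the loop:

  if (Changed)
    SE->forgetLoop(L);  // cached backedge-taken counts for L are now stale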
if (Changed) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp index a91f53ba663f..eeaad39dc1d1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -13,33 +13,15 @@ #include "llvm/Transforms/Scalar/LoopRotation.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" -#include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopRotationUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" -#include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; #define DEBUG_TYPE "loop-rotate" @@ -48,595 +30,6 @@ static cl::opt<unsigned> DefaultRotationThreshold( "rotation-max-header-size", cl::init(16), cl::Hidden, cl::desc("The default maximum header size for automatic loop rotation")); -STATISTIC(NumRotated, "Number of loops rotated"); - -namespace { -/// A simple loop rotation transformation. -class LoopRotate { - const unsigned MaxHeaderSize; - LoopInfo *LI; - const TargetTransformInfo *TTI; - AssumptionCache *AC; - DominatorTree *DT; - ScalarEvolution *SE; - const SimplifyQuery &SQ; - -public: - LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ) - : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), - SQ(SQ) {} - bool processLoop(Loop *L); - -private: - bool rotateLoop(Loop *L, bool SimplifiedLatch); - bool simplifyLoopLatch(Loop *L); -}; -} // end anonymous namespace - -/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the -/// old header into the preheader. If there were uses of the values produced by -/// these instruction that were outside of the loop, we have to insert PHI nodes -/// to merge the two values. Do this now. -static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, - BasicBlock *OrigPreheader, - ValueToValueMapTy &ValueMap, - SmallVectorImpl<PHINode*> *InsertedPHIs) { - // Remove PHI node entries that are no longer live. - BasicBlock::iterator I, E = OrigHeader->end(); - for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) - PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); - - // Now fix up users of the instructions in OrigHeader, inserting PHI nodes - // as necessary. - SSAUpdater SSA(InsertedPHIs); - for (I = OrigHeader->begin(); I != E; ++I) { - Value *OrigHeaderVal = &*I; - - // If there are no uses of the value (e.g. 
because it returns void), there - // is nothing to rewrite. - if (OrigHeaderVal->use_empty()) - continue; - - Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal); - - // The value now exits in two versions: the initial value in the preheader - // and the loop "next" value in the original header. - SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName()); - SSA.AddAvailableValue(OrigHeader, OrigHeaderVal); - SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal); - - // Visit each use of the OrigHeader instruction. - for (Value::use_iterator UI = OrigHeaderVal->use_begin(), - UE = OrigHeaderVal->use_end(); - UI != UE;) { - // Grab the use before incrementing the iterator. - Use &U = *UI; - - // Increment the iterator before removing the use from the list. - ++UI; - - // SSAUpdater can't handle a non-PHI use in the same block as an - // earlier def. We can easily handle those cases manually. - Instruction *UserInst = cast<Instruction>(U.getUser()); - if (!isa<PHINode>(UserInst)) { - BasicBlock *UserBB = UserInst->getParent(); - - // The original users in the OrigHeader are already using the - // original definitions. - if (UserBB == OrigHeader) - continue; - - // Users in the OrigPreHeader need to use the value to which the - // original definitions are mapped. - if (UserBB == OrigPreheader) { - U = OrigPreHeaderVal; - continue; - } - } - - // Anything else can be handled by SSAUpdater. - SSA.RewriteUse(U); - } - - // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug - // intrinsics. - SmallVector<DbgValueInst *, 1> DbgValues; - llvm::findDbgValues(DbgValues, OrigHeaderVal); - for (auto &DbgValue : DbgValues) { - // The original users in the OrigHeader are already using the original - // definitions. - BasicBlock *UserBB = DbgValue->getParent(); - if (UserBB == OrigHeader) - continue; - - // Users in the OrigPreHeader need to use the value to which the - // original definitions are mapped and anything else can be handled by - // the SSAUpdater. To avoid adding PHINodes, check if the value is - // available in UserBB, if not substitute undef. - Value *NewVal; - if (UserBB == OrigPreheader) - NewVal = OrigPreHeaderVal; - else if (SSA.HasValueForBlock(UserBB)) - NewVal = SSA.GetValueInMiddleOfBlock(UserBB); - else - NewVal = UndefValue::get(OrigHeaderVal->getType()); - DbgValue->setOperand(0, - MetadataAsValue::get(OrigHeaderVal->getContext(), - ValueAsMetadata::get(NewVal))); - } - } -} - -/// Propagate dbg.value intrinsics through the newly inserted Phis. -static void insertDebugValues(BasicBlock *OrigHeader, - SmallVectorImpl<PHINode*> &InsertedPHIs) { - ValueToValueMapTy DbgValueMap; - - // Map existing PHI nodes to their dbg.values. - for (auto &I : *OrigHeader) { - if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) { - if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation())) - DbgValueMap.insert({Loc, DbgII}); - } - } - - // Then iterate through the new PHIs and look to see if they use one of the - // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will - // propagate the info through the new PHI. 
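The use-rewriting logic being deleted here (the implementation moves behind the new LoopRotationUtils include) follows SSAUpdater's standard protocol: declare the value, register one definition per block, then rewrite each use. Condensed:

  SSAUpdater SSA;
  SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
  SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);        // loop "next" value
  SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);  // preheader value
  for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
                           UE = OrigHeaderVal->use_end(); UI != UE;) {
    Use &U = *UI++;     // advance first: RewriteUse may unlink this use
    SSA.RewriteUse(U);  // inserts PHIs at join points on demand
  }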
- LLVMContext &C = OrigHeader->getContext(); - for (auto PHI : InsertedPHIs) { - for (auto VI : PHI->operand_values()) { - auto V = DbgValueMap.find(VI); - if (V != DbgValueMap.end()) { - auto *DbgII = cast<DbgInfoIntrinsic>(V->second); - Instruction *NewDbgII = DbgII->clone(); - auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI)); - NewDbgII->setOperand(0, PhiMAV); - BasicBlock *Parent = PHI->getParent(); - NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime()); - } - } - } -} - -/// Rotate loop LP. Return true if the loop is rotated. -/// -/// \param SimplifiedLatch is true if the latch was just folded into the final -/// loop exit. In this case we may want to rotate even though the new latch is -/// now an exiting branch. This rotation would have happened had the latch not -/// been simplified. However, if SimplifiedLatch is false, then we avoid -/// rotating loops in which the latch exits to avoid excessive or endless -/// rotation. LoopRotate should be repeatable and converge to a canonical -/// form. This property is satisfied because simplifying the loop latch can only -/// happen once across multiple invocations of the LoopRotate pass. -bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { - // If the loop has only one block then there is not much to rotate. - if (L->getBlocks().size() == 1) - return false; - - BasicBlock *OrigHeader = L->getHeader(); - BasicBlock *OrigLatch = L->getLoopLatch(); - - BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); - if (!BI || BI->isUnconditional()) - return false; - - // If the loop header is not one of the loop exiting blocks then - // either this loop is already rotated or it is not - // suitable for loop rotation transformations. - if (!L->isLoopExiting(OrigHeader)) - return false; - - // If the loop latch already contains a branch that leaves the loop then the - // loop is already rotated. - if (!OrigLatch) - return false; - - // Rotate if either the loop latch does *not* exit the loop, or if the loop - // latch was just simplified. - if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch) - return false; - - // Check size of original header and reject loop if it is very big or we can't - // duplicate blocks inside it. - { - SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - - CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); - if (Metrics.notDuplicatable) { - DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" - << " instructions: "; - L->dump()); - return false; - } - if (Metrics.convergent) { - DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " - "instructions: "; - L->dump()); - return false; - } - if (Metrics.NumInsts > MaxHeaderSize) - return false; - } - - // Now, this loop is suitable for rotation. - BasicBlock *OrigPreheader = L->getLoopPreheader(); - - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!OrigPreheader) - return false; - - // Anything ScalarEvolution may know about this loop or the PHI nodes - // in its header will soon be invalidated. - if (SE) - SE->forgetLoop(L); - - DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); - - // Find new Loop header. NewHeader is a Header's one and only successor - // that is inside loop. Header's other successor is outside the - // loop. Otherwise loop is not suitable for rotation. 
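What rotateLoop is driving at, shown as hypothetical source (body() is a stand-in): the exiting header becomes a guarded do-while, leaving the latch as the only exiting block.

  void body(int);                 // illustrative loop payload
  void before(int n) {
    int i = 0;
    while (i < n) {               // header both tests and exits
      body(i);
      ++i;                        // latch
    }
  }
  void after(int n) {
    int i = 0;
    if (i < n) {                  // cloned header test guards loop entry
      do {
        body(i);
        ++i;
      } while (i < n);            // rotated: the test now sits at the latch
    }
  }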
- BasicBlock *Exit = BI->getSuccessor(0); - BasicBlock *NewHeader = BI->getSuccessor(1); - if (L->contains(Exit)) - std::swap(Exit, NewHeader); - assert(NewHeader && "Unable to determine new loop header"); - assert(L->contains(NewHeader) && !L->contains(Exit) && - "Unable to determine loop header and exit blocks"); - - // This code assumes that the new header has exactly one predecessor. - // Remove any single-entry PHI nodes in it. - assert(NewHeader->getSinglePredecessor() && - "New header doesn't have one pred!"); - FoldSingleEntryPHINodes(NewHeader); - - // Begin by walking OrigHeader and populating ValueMap with an entry for - // each Instruction. - BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); - ValueToValueMapTy ValueMap; - - // For PHI nodes, the value available in OldPreHeader is just the - // incoming value from OldPreHeader. - for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) - ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); - - // For the rest of the instructions, either hoist to the OrigPreheader if - // possible or create a clone in the OldPreHeader if not. - TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); - - // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication. - using DbgIntrinsicHash = - std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>; - auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash { - return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()}; - }; - SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics; - for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend(); - I != E; ++I) { - if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&*I)) - DbgIntrinsics.insert(makeHash(DII)); - else - break; - } - - while (I != E) { - Instruction *Inst = &*I++; - - // If the instruction's operands are invariant and it doesn't read or write - // memory, then it is safe to hoist. Doing this doesn't change the order of - // execution in the preheader, but does prevent the instruction from - // executing in each iteration of the loop. This means it is safe to hoist - // something that might trap, but isn't safe to hoist something that reads - // memory (without proving that the loop doesn't write). - if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && - !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) && - !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) { - Inst->moveBefore(LoopEntryBranch); - continue; - } - - // Otherwise, create a duplicate of the instruction. - Instruction *C = Inst->clone(); - - // Eagerly remap the operands of the instruction. - RemapInstruction(C, ValueMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - - // Avoid inserting the same intrinsic twice. - if (auto *DII = dyn_cast<DbgInfoIntrinsic>(C)) - if (DbgIntrinsics.count(makeHash(DII))) { - C->deleteValue(); - continue; - } - - // With the operands remapped, see if the instruction constant folds or is - // otherwise simplifyable. This commonly occurs because the entry from PHI - // nodes allows icmps and other instructions to fold. - Value *V = SimplifyInstruction(C, SQ); - if (V && LI->replacementPreservesLCSSAForm(C, V)) { - // If so, then delete the temporary instruction and stick the folded value - // in the map. - ValueMap[Inst] = V; - if (!C->mayHaveSideEffects()) { - C->deleteValue(); - C = nullptr; - } - } else { - ValueMap[Inst] = C; - } - if (C) { - // Otherwise, stick the new instruction into the new block! 
- C->setName(Inst->getName()); - C->insertBefore(LoopEntryBranch); - - if (auto *II = dyn_cast<IntrinsicInst>(C)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); - } - } - - // Along with all the other instructions, we just cloned OrigHeader's - // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's - // successors by duplicating their incoming values for OrigHeader. - TerminatorInst *TI = OrigHeader->getTerminator(); - for (BasicBlock *SuccBB : TI->successors()) - for (BasicBlock::iterator BI = SuccBB->begin(); - PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); - - // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove - // OrigPreHeader's old terminator (the original branch into the loop), and - // remove the corresponding incoming values from the PHI nodes in OrigHeader. - LoopEntryBranch->eraseFromParent(); - - - SmallVector<PHINode*, 2> InsertedPHIs; - // If there were any uses of instructions in the duplicated block outside the - // loop, update them, inserting PHI nodes as required - RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, - &InsertedPHIs); - - // Attach dbg.value intrinsics to the new phis if that phi uses a value that - // previously had debug metadata attached. This keeps the debug info - // up-to-date in the loop body. - if (!InsertedPHIs.empty()) - insertDebugValues(OrigHeader, InsertedPHIs); - - // NewHeader is now the header of the loop. - L->moveToHeader(NewHeader); - assert(L->getHeader() == NewHeader && "Latch block is our new header"); - - // Inform DT about changes to the CFG. - if (DT) { - // The OrigPreheader branches to the NewHeader and Exit now. Then, inform - // the DT about the removed edge to the OrigHeader (that got removed). - SmallVector<DominatorTree::UpdateType, 3> Updates; - Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit}); - Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader}); - Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader}); - DT->applyUpdates(Updates); - } - - // At this point, we've finished our major CFG changes. As part of cloning - // the loop into the preheader we've simplified instructions and the - // duplicated conditional branch may now be branching on a constant. If it is - // branching on a constant and if that constant means that we enter the loop, - // then we fold away the cond branch to an uncond branch. This simplifies the - // loop in cases important for nested loops, and it also means we don't have - // to split as many edges. - BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); - assert(PHBI->isConditional() && "Should be clone of BI condbr!"); - if (!isa<ConstantInt>(PHBI->getCondition()) || - PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) != - NewHeader) { - // The conditional branch can't be folded, handle the general case. - // Split edges as necessary to preserve LoopSimplify form. - - // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and - // thus is not a preheader anymore. - // Split the edge to form a real preheader. - BasicBlock *NewPH = SplitCriticalEdge( - OrigPreheader, NewHeader, - CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); - NewPH->setName(NewHeader->getName() + ".lr.ph"); - - // Preserve canonical loop form, which means that 'Exit' should have only - // one predecessor. 
Note that Exit could be an exit block for multiple - // nested loops, causing both of the edges to now be critical and need to - // be split. - SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit)); - bool SplitLatchEdge = false; - for (BasicBlock *ExitPred : ExitPreds) { - // We only need to split loop exit edges. - Loop *PredLoop = LI->getLoopFor(ExitPred); - if (!PredLoop || PredLoop->contains(Exit)) - continue; - if (isa<IndirectBrInst>(ExitPred->getTerminator())) - continue; - SplitLatchEdge |= L->getLoopLatch() == ExitPred; - BasicBlock *ExitSplit = SplitCriticalEdge( - ExitPred, Exit, - CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); - ExitSplit->moveBefore(Exit); - } - assert(SplitLatchEdge && - "Despite splitting all preds, failed to split latch exit?"); - } else { - // We can fold the conditional branch in the preheader, this makes things - // simpler. The first step is to remove the extra edge to the Exit block. - Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); - BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); - NewBI->setDebugLoc(PHBI->getDebugLoc()); - PHBI->eraseFromParent(); - - // With our CFG finalized, update DomTree if it is available. - if (DT) DT->deleteEdge(OrigPreheader, Exit); - } - - assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); - assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); - - // Now that the CFG and DomTree are in a consistent state again, try to merge - // the OrigHeader block into OrigLatch. This will succeed if they are - // connected by an unconditional branch. This is just a cleanup so the - // emitted code isn't too gross in this common case. - MergeBlockIntoPredecessor(OrigHeader, DT, LI); - - DEBUG(dbgs() << "LoopRotation: into "; L->dump()); - - ++NumRotated; - return true; -} - -/// Determine whether the instructions in this range may be safely and cheaply -/// speculated. This is not an important enough situation to develop complex -/// heuristics. We handle a single arithmetic instruction along with any type -/// conversions. -static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, - BasicBlock::iterator End, Loop *L) { - bool seenIncrement = false; - bool MultiExitLoop = false; - - if (!L->getExitingBlock()) - MultiExitLoop = true; - - for (BasicBlock::iterator I = Begin; I != End; ++I) { - - if (!isSafeToSpeculativelyExecute(&*I)) - return false; - - if (isa<DbgInfoIntrinsic>(I)) - continue; - - switch (I->getOpcode()) { - default: - return false; - case Instruction::GetElementPtr: - // GEPs are cheap if all indices are constant. - if (!cast<GEPOperator>(I)->hasAllConstantIndices()) - return false; - // fall-thru to increment case - LLVM_FALLTHROUGH; - case Instruction::Add: - case Instruction::Sub: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: { - Value *IVOpnd = - !isa<Constant>(I->getOperand(0)) - ? I->getOperand(0) - : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr; - if (!IVOpnd) - return false; - - // If increment operand is used outside of the loop, this speculation - // could cause extra live range interference. 
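shouldSpeculateInstrs, whose body spans this hunk, is deliberately narrow: it accepts any number of integer conversions, at most one cheap arithmetic "increment" (constant-index GEPs count as one), and rejects everything else. A toy, self-contained version of that accept/reject shape over opcode tags (illustrative only, not the LLVM instruction API):

#include <vector>

// Opcode tags; GepConstIdx means a GEP whose indices are all constant.
enum class Op {
  GepConstIdx, Add, Sub, And, Or, Xor, Shl, LShr, AShr,
  Trunc, ZExt, SExt, Other
};

// Toy analogue of shouldSpeculateInstrs: free type conversions, a
// single arithmetic increment, and anything else rejects the range.
bool shouldSpeculate(const std::vector<Op> &Insts) {
  bool SeenIncrement = false;
  for (Op O : Insts) {
    switch (O) {
    case Op::Trunc:
    case Op::ZExt:
    case Op::SExt:
      break; // ignore type conversions
    case Op::GepConstIdx:
    case Op::Add: case Op::Sub:
    case Op::And: case Op::Or: case Op::Xor:
    case Op::Shl: case Op::LShr: case Op::AShr:
      if (SeenIncrement)
        return false; // only one increment is allowed
      SeenIncrement = true;
      break;
    default:
      return false;
    }
  }
  return true;
}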
- if (MultiExitLoop) { - for (User *UseI : IVOpnd->users()) { - auto *UserInst = cast<Instruction>(UseI); - if (!L->contains(UserInst)) - return false; - } - } - - if (seenIncrement) - return false; - seenIncrement = true; - break; - } - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - // ignore type conversions - break; - } - } - return true; -} - -/// Fold the loop tail into the loop exit by speculating the loop tail -/// instructions. Typically, this is a single post-increment. In the case of a -/// simple 2-block loop, hoisting the increment can be much better than -/// duplicating the entire loop header. In the case of loops with early exits, -/// rotation will not work anyway, but simplifyLoopLatch will put the loop in -/// canonical form so downstream passes can handle it. -/// -/// I don't believe this invalidates SCEV. -bool LoopRotate::simplifyLoopLatch(Loop *L) { - BasicBlock *Latch = L->getLoopLatch(); - if (!Latch || Latch->hasAddressTaken()) - return false; - - BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); - if (!Jmp || !Jmp->isUnconditional()) - return false; - - BasicBlock *LastExit = Latch->getSinglePredecessor(); - if (!LastExit || !L->isLoopExiting(LastExit)) - return false; - - BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); - if (!BI) - return false; - - if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L)) - return false; - - DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " - << LastExit->getName() << "\n"); - - // Hoist the instructions from Latch into LastExit. - LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), - Latch->begin(), Jmp->getIterator()); - - unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; - BasicBlock *Header = Jmp->getSuccessor(0); - assert(Header == L->getHeader() && "expected a backward branch"); - - // Remove Latch from the CFG so that LastExit becomes the new Latch. - BI->setSuccessor(FallThruPath, Header); - Latch->replaceSuccessorsPhiUsesWith(LastExit); - Jmp->eraseFromParent(); - - // Nuke the Latch block. - assert(Latch->empty() && "unable to evacuate Latch"); - LI->removeBlock(Latch); - if (DT) - DT->eraseNode(Latch); - Latch->eraseFromParent(); - return true; -} - -/// Rotate \c L, and return true if any modification was made. -bool LoopRotate::processLoop(Loop *L) { - // Save the loop metadata. - MDNode *LoopMD = L->getLoopID(); - - // Simplify the loop latch before attempting to rotate the header - // upward. Rotation may not be needed if the loop tail can be folded into the - // loop exit. - bool SimplifiedLatch = simplifyLoopLatch(L); - - bool MadeChange = rotateLoop(L, SimplifiedLatch); - assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) && - "Loop latch should be exiting after loop-rotate."); - - // Restore the loop metadata. - // NB! We presume LoopRotation DOESN'T ADD its own metadata. - if ((MadeChange || SimplifiedLatch) && LoopMD) - L->setLoopID(LoopMD); - - return MadeChange || SimplifiedLatch; -} - LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication) : EnableHeaderDuplication(EnableHeaderDuplication) {} @@ -646,10 +39,10 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, int Threshold = EnableHeaderDuplication ? 
DefaultRotationThreshold : 0; const DataLayout &DL = L.getHeader()->getModule()->getDataLayout(); const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL); - LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, - SQ); - bool Changed = LR.processLoop(&L); + bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, SQ, + false, Threshold, false); + if (!Changed) return PreservedAnalyses::all(); @@ -691,8 +84,8 @@ public: auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); auto *SE = SEWP ? &SEWP->getSE() : nullptr; const SimplifyQuery SQ = getBestSimplifyQuery(*this, F); - LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE, SQ); - return LR.processLoop(L); + return LoopRotation(L, LI, TTI, AC, DT, SE, SQ, false, MaxHeaderSize, + false); } }; } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 35c05e84fd68..2b83d3dc5f1b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -30,13 +30,16 @@ #include "llvm/IR/Dominators.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; #define DEBUG_TYPE "loop-simplifycfg" -static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) { +static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI, + ScalarEvolution &SE) { bool Changed = false; // Copy blocks into a temporary array to avoid iterator invalidation issues // as we remove them. @@ -53,11 +56,10 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) { if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L) continue; - // Pred is going to disappear, so we need to update the loop info. - if (L.getHeader() == Pred) - L.moveToHeader(Succ); - LI.removeBlock(Pred); - MergeBasicBlockIntoOnlyPred(Succ, &DT); + // Merge Succ into Pred and delete it. 
+ MergeBlockIntoPredecessor(Succ, &DT, &LI); + + SE.forgetLoop(&L); Changed = true; } @@ -67,7 +69,7 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) { PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { - if (!simplifyLoopCFG(L, AR.DT, AR.LI)) + if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); @@ -87,7 +89,8 @@ public: DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - return simplifyLoopCFG(*L, DT, LI); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + return simplifyLoopCFG(*L, DT, LI, SE); } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp index 430a7085d93f..760177c9c5e9 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp @@ -42,6 +42,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" @@ -49,7 +50,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -200,17 +200,19 @@ static bool sinkInstruction(Loop &L, Instruction &I, SmallVector<BasicBlock *, 2> SortedBBsToSinkInto; SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(), BBsToSinkInto.end()); - std::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(), - [&](BasicBlock *A, BasicBlock *B) { - return *LoopBlockNumber.find(A) < *LoopBlockNumber.find(B); - }); + llvm::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(), + [&](BasicBlock *A, BasicBlock *B) { + return LoopBlockNumber.find(A)->second < + LoopBlockNumber.find(B)->second; + }); BasicBlock *MoveBB = *SortedBBsToSinkInto.begin(); // FIXME: Optimize the efficiency for cloned value replacement. The current // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()). - for (BasicBlock *N : SortedBBsToSinkInto) { - if (N == MoveBB) - continue; + for (BasicBlock *N : makeArrayRef(SortedBBsToSinkInto).drop_front(1)) { + assert(LoopBlockNumber.find(N)->second > + LoopBlockNumber.find(MoveBB)->second && + "BBs not sorted!"); // Clone I and replace its uses. 
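The llvm::sort change in the LoopSink hunk above also fixes the comparator: the old lambda compared *LoopBlockNumber.find(...), a (pointer, number) pair, so ties fell back to pointer order and the sink order could vary from run to run; the new one compares only the mapped numbers. A minimal sketch of sorting by such a side table, with a hypothetical Block type standing in for BasicBlock:

#include <algorithm>
#include <map>
#include <vector>

struct Block {}; // hypothetical stand-in for BasicBlock

// Sort by the precomputed per-block number, never by pointer value, so
// the resulting order is identical from run to run.
void sortByLoopNumber(std::vector<Block *> &BBs,
                      const std::map<Block *, int> &LoopBlockNumber) {
  std::sort(BBs.begin(), BBs.end(), [&](Block *A, Block *B) {
    return LoopBlockNumber.at(A) < LoopBlockNumber.at(B);
  });
}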
Instruction *IC = I.clone(); IC->setName(I.getName()); @@ -224,11 +226,11 @@ static bool sinkInstruction(Loop &L, Instruction &I, } // Replaces uses of I with IC in blocks dominated by N replaceDominatedUsesWith(&I, IC, DT, N); - DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName() - << '\n'); + LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName() + << '\n'); NumLoopSunkCloned++; } - DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n'); + LLVM_DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n'); NumLoopSunk++; I.moveBefore(&*MoveBB->getFirstInsertionPt()); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index ff3e9eef16d9..fa83b48210bc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -75,6 +75,8 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -105,8 +107,8 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -121,7 +123,7 @@ using namespace llvm; #define DEBUG_TYPE "loop-reduce" -/// MaxIVUsers is an arbitrary threshold that provides an early opportunitiy for +/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for /// bail out. This threshold is far beyond the number of users that LSR can /// conceivably solve, so it should not affect generated code, but catches the /// worst cases before LSR burns too much compile time and stack space. @@ -185,6 +187,8 @@ struct MemAccessTy { unsigned AS = UnknownAddressSpace) { return MemAccessTy(Type::getVoidTy(Ctx), AS); } + + Type *getType() { return MemTy; } }; /// This class holds data which is used to order reuse candidates. @@ -327,7 +331,7 @@ struct Formula { /// #2 enforces that 1 * reg is reg. /// #3 ensures invariant regs with respect to current loop can be combined /// together in LSR codegen. - /// This invariant can be temporarly broken while building a formula. + /// This invariant can be temporarily broken while building a formula. /// However, every formula inserted into the LSRInstance must be in canonical /// form. SmallVector<const SCEV *, 4> BaseRegs; @@ -442,7 +446,7 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { canonicalize(*L); } -/// \brief Check whether or not this formula statisfies the canonical +/// Check whether or not this formula satisfies the canonical /// representation. /// \see Formula::BaseRegs. bool Formula::isCanonical(const Loop &L) const { @@ -470,7 +474,7 @@ bool Formula::isCanonical(const Loop &L) const { return I == BaseRegs.end(); } -/// \brief Helper method to morph a formula into its canonical representation. +/// Helper method to morph a formula into its canonical representation. /// \see Formula::BaseRegs. /// Every formula having more than one base register, must use the ScaledReg /// field. 
Otherwise, we would have to do special cases everywhere in LSR @@ -505,7 +509,7 @@ void Formula::canonicalize(const Loop &L) { } } -/// \brief Get rid of the scale in the formula. +/// Get rid of the scale in the formula. /// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2. /// \return true if it was possible to get rid of the scale, false otherwise. /// \note After this operation the formula may not be in the canonical form. @@ -818,7 +822,7 @@ static bool isAddressUse(const TargetTransformInfo &TTI, /// Return the type of the memory being accessed. static MemAccessTy getAccessType(const TargetTransformInfo &TTI, - Instruction *Inst) { + Instruction *Inst, Value *OperandVal) { MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace); if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { AccessTy.MemTy = SI->getOperand(0)->getType(); @@ -832,7 +836,14 @@ static MemAccessTy getAccessType(const TargetTransformInfo &TTI, } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { switch (II->getIntrinsicID()) { case Intrinsic::prefetch: + case Intrinsic::memset: AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace(); + AccessTy.MemTy = OperandVal->getType(); + break; + case Intrinsic::memmove: + case Intrinsic::memcpy: + AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace(); + AccessTy.MemTy = OperandVal->getType(); break; default: { MemIntrinsicInfo IntrInfo; @@ -937,7 +948,7 @@ static bool isHighCostExpansion(const SCEV *S, return true; } -/// If any of the instructions is the specified set are trivially dead, delete +/// If any of the instructions in the specified set are trivially dead, delete /// them and see if this makes any of their operands subsequently dead. static bool DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) { @@ -970,7 +981,7 @@ class LSRUse; } // end anonymous namespace -/// \brief Check if the addressing mode defined by \p F is completely +/// Check if the addressing mode defined by \p F is completely /// folded in \p LU at isel time. /// This includes address-mode folding and special icmp tricks. /// This function returns true if \p LU can accommodate what \p F @@ -1040,12 +1051,14 @@ private: void RateRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, - ScalarEvolution &SE, DominatorTree &DT); + ScalarEvolution &SE, DominatorTree &DT, + const TargetTransformInfo &TTI); void RatePrimaryRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, ScalarEvolution &SE, DominatorTree &DT, - SmallPtrSetImpl<const SCEV *> *LoserRegs); + SmallPtrSetImpl<const SCEV *> *LoserRegs, + const TargetTransformInfo &TTI); }; /// An operand value in an instruction which is to be replaced with some @@ -1194,7 +1207,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, void Cost::RateRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, - ScalarEvolution &SE, DominatorTree &DT) { + ScalarEvolution &SE, DominatorTree &DT, + const TargetTransformInfo &TTI) { if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { // If this is an addrec for another loop, it should be an invariant // with respect to L since L is the innermost loop (at least @@ -1215,13 +1229,28 @@ void Cost::RateRegister(const SCEV *Reg, ++C.NumRegs; return; } - C.AddRecCost += 1; /// TODO: This should be a function of the stride. 
+ + unsigned LoopCost = 1; + if (TTI.shouldFavorPostInc()) { + const SCEV *LoopStep = AR->getStepRecurrence(SE); + if (isa<SCEVConstant>(LoopStep)) { + // Check if a post-indexed load/store can be used. + if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) || + TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) { + const SCEV *LoopStart = AR->getStart(); + if (!isa<SCEVConstant>(LoopStart) && + SE.isLoopInvariant(LoopStart, L)) + LoopCost = 0; + } + } + } + C.AddRecCost += LoopCost; // Add the step value register, if it needs one. // TODO: The non-affine case isn't precisely modeled here. if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) { if (!Regs.count(AR->getOperand(1))) { - RateRegister(AR->getOperand(1), Regs, L, SE, DT); + RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI); if (isLoser()) return; } @@ -1249,13 +1278,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, ScalarEvolution &SE, DominatorTree &DT, - SmallPtrSetImpl<const SCEV *> *LoserRegs) { + SmallPtrSetImpl<const SCEV *> *LoserRegs, + const TargetTransformInfo &TTI) { if (LoserRegs && LoserRegs->count(Reg)) { Lose(); return; } if (Regs.insert(Reg).second) { - RateRegister(Reg, Regs, L, SE, DT); + RateRegister(Reg, Regs, L, SE, DT, TTI); if (LoserRegs && isLoser()) LoserRegs->insert(Reg); } @@ -1279,7 +1309,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, Lose(); return; } - RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs); + RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI); if (isLoser()) return; } @@ -1288,7 +1318,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, Lose(); return; } - RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs); + RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI); if (isLoser()) return; } @@ -1343,14 +1373,15 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, // If ICmpZero formula ends with not 0, it could not be replaced by // just add or sub. We'll need to compare final result of AddRec. - // That means we'll need an additional instruction. + // That means we'll need an additional instruction. But if the target can + // macro-fuse a compare with a branch, don't count this extra instruction. // For -10 + {0, +, 1}: // i = i + 1; // cmp i, 10 // // For {-10, +, 1}: // i = i + 1; - if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd()) + if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() && !TTI.canMacroFuseCmp()) C.Insns++; // Each new AddRec adds 1 instruction to calculation. C.Insns += (C.AddRecCost - PrevAddRecCost); @@ -1456,7 +1487,7 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); // Unstable sort by host order ok, because this is only used for uniquifying. - std::sort(Key.begin(), Key.end()); + llvm::sort(Key.begin(), Key.end()); return Uniquifier.count(Key); } @@ -1480,7 +1511,7 @@ bool LSRUse::InsertFormula(const Formula &F, const Loop &L) { SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); // Unstable sort by host order ok, because this is only used for uniquifying. - std::sort(Key.begin(), Key.end()); + llvm::sort(Key.begin(), Key.end()); if (!Uniquifier.insert(Key).second) return false; @@ -2384,24 +2415,27 @@ LSRInstance::OptimizeLoopTermCond() { C->getValue().isMinSignedValue()) goto decline_post_inc; // Check for possible scaled-address reuse. 
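The new AddRecCost logic above charges nothing for an address recurrence when the target favors post-increment addressing and the step can fold into an indexed load or store. A boiled-down sketch of that decision; the four booleans are hypothetical stand-ins for TTI.shouldFavorPostInc(), the SCEVConstant check on the step, the TTI.isIndexedLoadLegal/isIndexedStoreLegal queries, and the loop-invariance check on the start value:

// Sketch only: the shape of the cost tweak, not the LLVM API.
unsigned addRecLoopCost(bool FavorPostInc, bool StepIsConstant,
                        bool IndexedAccessLegal,
                        bool StartIsLoopInvariantNonConst) {
  if (FavorPostInc && StepIsConstant && IndexedAccessLegal &&
      StartIsLoopInvariantNonConst)
    return 0; // the increment folds into a post-indexed access
  return 1;   // otherwise: one update instruction per iteration
}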
- MemAccessTy AccessTy = getAccessType(TTI, UI->getUser()); - int64_t Scale = C->getSExtValue(); - if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, - /*BaseOffset=*/0, - /*HasBaseReg=*/false, Scale, - AccessTy.AddrSpace)) - goto decline_post_inc; - Scale = -Scale; - if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, - /*BaseOffset=*/0, - /*HasBaseReg=*/false, Scale, - AccessTy.AddrSpace)) - goto decline_post_inc; + if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) { + MemAccessTy AccessTy = getAccessType( + TTI, UI->getUser(), UI->getOperandValToReplace()); + int64_t Scale = C->getSExtValue(); + if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, + /*BaseOffset=*/0, + /*HasBaseReg=*/false, Scale, + AccessTy.AddrSpace)) + goto decline_post_inc; + Scale = -Scale; + if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, + /*BaseOffset=*/0, + /*HasBaseReg=*/false, Scale, + AccessTy.AddrSpace)) + goto decline_post_inc; + } } } - DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: " - << *Cond << '\n'); + LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: " + << *Cond << '\n'); // It's possible for the setcc instruction to be anywhere in the loop, and // possible for it to have multiple users. If it is not immediately before @@ -2642,7 +2676,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() { if (Types.size() == 1) Types.clear(); - DEBUG(print_factors_and_types(dbgs())); + LLVM_DEBUG(print_factors_and_types(dbgs())); } /// Helper for CollectChains that finds an IV operand (computed by an AddRec in @@ -2666,7 +2700,7 @@ findIVOperand(User::op_iterator OI, User::op_iterator OE, return OI; } -/// IVChain logic must consistenctly peek base TruncInst operands, so wrap it in +/// IVChain logic must consistently peek base TruncInst operands, so wrap it in /// a convenient helper. static Value *getWideOperand(Value *Oper) { if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper)) @@ -2773,10 +2807,9 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users, return false; if (!Users.empty()) { - DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n"; - for (Instruction *Inst : Users) { - dbgs() << " " << *Inst << "\n"; - }); + LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n"; + for (Instruction *Inst + : Users) { dbgs() << " " << *Inst << "\n"; }); return false; } assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); @@ -2829,8 +2862,8 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users, // the stride. 
cost -= NumReusedIncrements; - DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost - << "\n"); + LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost + << "\n"); return cost < 0; } @@ -2883,7 +2916,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, if (isa<PHINode>(UserInst)) return; if (NChains >= MaxChains && !StressIVChain) { - DEBUG(dbgs() << "IV Chain Limit\n"); + LLVM_DEBUG(dbgs() << "IV Chain Limit\n"); return; } LastIncExpr = OperExpr; @@ -2896,11 +2929,11 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr), OperExprBase)); ChainUsersVec.resize(NChains); - DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst - << ") IV=" << *LastIncExpr << "\n"); + LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst + << ") IV=" << *LastIncExpr << "\n"); } else { - DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst - << ") IV+" << *LastIncExpr << "\n"); + LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst + << ") IV+" << *LastIncExpr << "\n"); // Add this IV user to the end of the chain. IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr)); } @@ -2970,7 +3003,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, /// loop latch. This will discover chains on side paths, but requires /// maintaining multiple copies of the Chains state. void LSRInstance::CollectChains() { - DEBUG(dbgs() << "Collecting IV Chains.\n"); + LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n"); SmallVector<ChainUsers, 8> ChainUsersVec; SmallVector<BasicBlock *,8> LatchPath; @@ -3039,10 +3072,10 @@ void LSRInstance::CollectChains() { void LSRInstance::FinalizeChain(IVChain &Chain) { assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); - DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n"); + LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n"); for (const IVInc &Inc : Chain) { - DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n"); + LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n"); auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand); assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand"); IVIncSet.insert(UseI); @@ -3059,7 +3092,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, if (IncConst->getAPInt().getMinSignedBits() > 64) return false; - MemAccessTy AccessTy = getAccessType(TTI, UserInst); + MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand); int64_t IncOffset = IncConst->getValue()->getSExtValue(); if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, IncOffset, /*HaseBaseReg=*/false)) @@ -3099,11 +3132,11 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, } if (IVOpIter == IVOpEnd) { // Gracefully give up on this chain. 
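An IV chain, in source terms, is a run of users of the same induction variable at constant strides; profitability comes down to whether threading the increments beats materializing each base + k*stride address, which is why reused increments subtract from the cost and the chain is kept only when the total goes negative. An illustrative before/after (hand-written, not compiler output):

// Before: four independent addresses computed from one base pointer.
float sumIndexed(const float *P) {
  return P[0] + P[4] + P[8] + P[12];
}

// After chaining: a single pointer bumped by the common stride, the
// shape the IV-chain logic tries to produce when it is profitable.
float sumChained(const float *P) {
  const float *Q = P;
  float S = *Q; Q += 4;
  S += *Q;      Q += 4;
  S += *Q;      Q += 4;
  S += *Q;
  return S;
}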
- DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n"); + LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n"); return; } - DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); + LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); Type *IVTy = IVSrc->getType(); Type *IntTy = SE.getEffectiveSCEVType(IVTy); const SCEV *LeftOverExpr = nullptr; @@ -3179,7 +3212,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { find(UserInst->operands(), U.getOperandValToReplace()); assert(UseI != UserInst->op_end() && "cannot find IV operand"); if (IVIncSet.count(UseI)) { - DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n'); + LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n'); continue; } @@ -3187,7 +3220,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { MemAccessTy AccessTy; if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) { Kind = LSRUse::Address; - AccessTy = getAccessType(TTI, UserInst); + AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace()); } const SCEV *S = IU.getExpr(U); @@ -3255,7 +3288,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { } } - DEBUG(print_fixups(dbgs())); + LLVM_DEBUG(print_fixups(dbgs())); } /// Insert a formula for the given expression into the given use, separating out @@ -3464,12 +3497,45 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, return S; } -/// \brief Helper function for LSRInstance::GenerateReassociations. +/// Return true if the SCEV represents a value that may end up as a +/// post-increment operation. +static bool mayUsePostIncMode(const TargetTransformInfo &TTI, + LSRUse &LU, const SCEV *S, const Loop *L, + ScalarEvolution &SE) { + if (LU.Kind != LSRUse::Address || + !LU.AccessTy.getType()->isIntOrIntVectorTy()) + return false; + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S); + if (!AR) + return false; + const SCEV *LoopStep = AR->getStepRecurrence(SE); + if (!isa<SCEVConstant>(LoopStep)) + return false; + if (LU.AccessTy.getType()->getScalarSizeInBits() != + LoopStep->getType()->getScalarSizeInBits()) + return false; + // Check if a post-indexed load/store can be used. + if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) || + TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) { + const SCEV *LoopStart = AR->getStart(); + if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L)) + return true; + } + return false; +} + +/// Helper function for LSRInstance::GenerateReassociations. void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, const Formula &Base, unsigned Depth, size_t Idx, bool IsScaledReg) { const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + // Don't generate reassociations for the base register of a value that + // may generate a post-increment operator. The reason is that the + // reassociations cause extra base+register formula to be created, + // and possibly chosen, but the post-increment is more efficient. + if (TTI.shouldFavorPostInc() && mayUsePostIncMode(TTI, LU, BaseReg, L, SE)) + return; SmallVector<const SCEV *, 8> AddOps; const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE); if (Remainder) @@ -3542,7 +3608,12 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, if (InsertFormula(LU, LUIdx, F)) // If that formula hadn't been seen before, recurse to find more like // it. 
-        GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
+      // Add a check on Log16(AddOps.size()), which is the same as
+      // Log2_32(AddOps.size()) >> 2, because Depth alone is not enough to
+      // bound compile time. This means that every time AddOps.size() is
+      // greater than 16^x we will add x to Depth.
+      GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
+                             Depth + 1 + (Log2_32(AddOps.size()) >> 2));
  }
}

@@ -3596,7 +3667,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
  }
}

-/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
+/// Helper function for LSRInstance::GenerateSymbolicOffsets.
void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
                                              const Formula &Base, size_t Idx,
                                              bool IsScaledReg) {
@@ -3628,7 +3699,7 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
                                /* IsScaledReg */ true);
}

-/// \brief Helper function for LSRInstance::GenerateConstantOffsets.
+/// Helper function for LSRInstance::GenerateConstantOffsets.
void LSRInstance::GenerateConstantOffsetsImpl(
    LSRUse &LU, unsigned LUIdx, const Formula &Base,
    const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
@@ -3938,10 +4009,11 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
    if (Imms.size() == 1)
      continue;

-    DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
-          for (const auto &Entry : Imms)
-            dbgs() << ' ' << Entry.first;
-          dbgs() << '\n');
+    LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
+               for (const auto &Entry
+                    : Imms) dbgs()
+               << ' ' << Entry.first;
+               dbgs() << '\n');

    // Examine each offset.
    for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
@@ -3953,7 +4025,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
      if (!isa<SCEVConstant>(OrigReg) &&
          UsedByIndicesMap[Reg].count() == 1) {
-        DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
+        LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
+                          << '\n');
        continue;
      }

@@ -4038,6 +4111,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
          NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
          if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
                          LU.Kind, LU.AccessTy, NewF)) {
+            if (TTI.shouldFavorPostInc() &&
+                mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
+              continue;
            if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
              continue;
            NewF = F;
@@ -4099,9 +4175,9 @@ LSRInstance::GenerateAllReuseFormulae() {

  GenerateCrossUseConstantOffsets();

-  DEBUG(dbgs() << "\n"
-               "After generating reuse formulae:\n";
-        print_uses(dbgs()));
+  LLVM_DEBUG(dbgs() << "\n"
+                       "After generating reuse formulae:\n";
+             print_uses(dbgs()));
}

/// If there are multiple formulae with the same set of registers used
@@ -4123,7 +4199,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
-    DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+    LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
+               dbgs() << '\n');

    bool Any = false;
    for (size_t FIdx = 0, NumForms = LU.Formulae.size();
@@ -4147,8 +4224,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
      // as the basis of rediscovering the desired formula that uses an AddRec
      // corresponding to the existing phi. Once all formulae have been
      // generated, these initial losers may be pruned.
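The depth charge added in the reassociation hunk above is easy to sanity-check numerically: Log2_32(N) >> 2 is an integer approximation of log16(N). A small self-contained check, where log2u is a portable stand-in for llvm::Log2_32:

#include <cassert>

// Portable stand-in for llvm::Log2_32 (floor of log2; log2u(0) == 0).
static unsigned log2u(unsigned V) {
  unsigned R = 0;
  while (V >>= 1)
    ++R;
  return R;
}

// Extra recursion depth charged per reassociation step: roughly
// 1 + log16(number of addends).
unsigned depthStep(unsigned NumAddOps) {
  return 1 + (log2u(NumAddOps) >> 2);
}

int main() {
  assert(depthStep(8) == 1);   // log2(8) = 3,   3 >> 2 == 0
  assert(depthStep(16) == 2);  // log2(16) = 4,  4 >> 2 == 1
  assert(depthStep(300) == 3); // log2(300) = 8, 8 >> 2 == 2
}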
- DEBUG(dbgs() << " Filtering loser "; F.print(dbgs()); - dbgs() << "\n"); + LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs()); + dbgs() << "\n"); } else { SmallVector<const SCEV *, 4> Key; @@ -4161,7 +4238,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { Key.push_back(F.ScaledReg); // Unstable sort by host order ok, because this is only used for // uniquifying. - std::sort(Key.begin(), Key.end()); + llvm::sort(Key.begin(), Key.end()); std::pair<BestFormulaeTy::const_iterator, bool> P = BestFormulae.insert(std::make_pair(Key, FIdx)); @@ -4175,10 +4252,10 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU); if (CostF.isLess(CostBest, TTI)) std::swap(F, Best); - DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); - dbgs() << "\n" - " in favor of formula "; Best.print(dbgs()); - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); + dbgs() << "\n" + " in favor of formula "; + Best.print(dbgs()); dbgs() << '\n'); } #ifndef NDEBUG ChangedFormulae = true; @@ -4197,11 +4274,11 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { BestFormulae.clear(); } - DEBUG(if (ChangedFormulae) { - dbgs() << "\n" - "After filtering out undesirable candidates:\n"; - print_uses(dbgs()); - }); + LLVM_DEBUG(if (ChangedFormulae) { + dbgs() << "\n" + "After filtering out undesirable candidates:\n"; + print_uses(dbgs()); + }); } // This is a rough guess that seems to work fairly well. @@ -4230,11 +4307,11 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const { /// register pressure); remove it to simplify the system. void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { - DEBUG(dbgs() << "The search space is too complex.\n"); + LLVM_DEBUG(dbgs() << "The search space is too complex.\n"); - DEBUG(dbgs() << "Narrowing the search space by eliminating formulae " - "which use a superset of registers used by other " - "formulae.\n"); + LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae " + "which use a superset of registers used by other " + "formulae.\n"); for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { LSRUse &LU = Uses[LUIdx]; @@ -4252,7 +4329,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { NewF.BaseRegs.erase(NewF.BaseRegs.begin() + (I - F.BaseRegs.begin())); if (LU.HasFormulaWithSameRegs(NewF)) { - DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); + dbgs() << '\n'); LU.DeleteFormula(F); --i; --e; @@ -4267,8 +4345,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { NewF.BaseRegs.erase(NewF.BaseRegs.begin() + (I - F.BaseRegs.begin())); if (LU.HasFormulaWithSameRegs(NewF)) { - DEBUG(dbgs() << " Deleting "; F.print(dbgs()); - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); + dbgs() << '\n'); LU.DeleteFormula(F); --i; --e; @@ -4283,8 +4361,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { LU.RecomputeRegs(LUIdx, RegUses); } - DEBUG(dbgs() << "After pre-selection:\n"; - print_uses(dbgs())); + LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } } @@ -4294,9 +4371,10 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { if (EstimateSearchSpaceComplexity() < ComplexityLimit) return; - DEBUG(dbgs() << "The search space is too complex.\n" - "Narrowing the search space by assuming that uses 
separated " - "by a constant offset will use the same registers.\n"); + LLVM_DEBUG( + dbgs() << "The search space is too complex.\n" + "Narrowing the search space by assuming that uses separated " + "by a constant offset will use the same registers.\n"); // This is especially useful for unrolled loops. @@ -4314,7 +4392,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { LU.Kind, LU.AccessTy)) continue; - DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n'); LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; @@ -4322,7 +4400,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { for (LSRFixup &Fixup : LU.Fixups) { Fixup.Offset += F.BaseOffset; LUThatHas->pushFixup(Fixup); - DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n'); + LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n'); } // Delete formulae from the new use which are no longer legal. @@ -4331,8 +4409,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { Formula &F = LUThatHas->Formulae[i]; if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset, LUThatHas->Kind, LUThatHas->AccessTy, F)) { - DEBUG(dbgs() << " Deleting "; F.print(dbgs()); - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); LUThatHas->DeleteFormula(F); --i; --e; @@ -4351,7 +4428,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { } } - DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); + LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } /// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that @@ -4359,15 +4436,14 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { /// eliminate. void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { - DEBUG(dbgs() << "The search space is too complex.\n"); + LLVM_DEBUG(dbgs() << "The search space is too complex.\n"); - DEBUG(dbgs() << "Narrowing the search space by re-filtering out " - "undesirable dedicated registers.\n"); + LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out " + "undesirable dedicated registers.\n"); FilterOutUndesirableDedicatedRegisters(); - DEBUG(dbgs() << "After pre-selection:\n"; - print_uses(dbgs())); + LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } } @@ -4378,15 +4454,16 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ /// The benefit is that it is more likely to find out a better solution /// from a formulae set with more Scale and ScaledReg variations than /// a formulae set with the same Scale and ScaledReg. The picking winner -/// reg heurstic will often keep the formulae with the same Scale and +/// reg heuristic will often keep the formulae with the same Scale and /// ScaledReg and filter others, and we want to avoid that if possible. 
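All of these narrowing passes trigger off EstimateSearchSpaceComplexity() reaching ComplexityLimit; since the solver picks one formula per use, the search space is roughly the product of each use's formula count. A sketch of such a saturating product (illustrative only; the in-tree estimator differs in its details):

#include <cstddef>
#include <limits>
#include <vector>

// The solver would have to consider roughly the product of per-use
// formula counts, so multiply them and clamp on overflow rather than
// wrapping around.
size_t estimateComplexity(const std::vector<size_t> &FormulaeCounts) {
  size_t Power = 1;
  const size_t Max = std::numeric_limits<size_t>::max();
  for (size_t N : FormulaeCounts) {
    if (N != 0 && Power > Max / N)
      return Max; // saturate instead of overflowing
    Power *= N;
  }
  return Power;
}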
void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() { if (EstimateSearchSpaceComplexity() < ComplexityLimit) return; - DEBUG(dbgs() << "The search space is too complex.\n" - "Narrowing the search space by choosing the best Formula " - "from the Formulae with the same Scale and ScaledReg.\n"); + LLVM_DEBUG( + dbgs() << "The search space is too complex.\n" + "Narrowing the search space by choosing the best Formula " + "from the Formulae with the same Scale and ScaledReg.\n"); // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse. using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>; @@ -4400,7 +4477,8 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() { for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { LSRUse &LU = Uses[LUIdx]; - DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); + dbgs() << '\n'); // Return true if Formula FA is better than Formula FB. auto IsBetterThan = [&](Formula &FA, Formula &FB) { @@ -4444,10 +4522,10 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() { Formula &Best = LU.Formulae[P.first->second]; if (IsBetterThan(F, Best)) std::swap(F, Best); - DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); - dbgs() << "\n" - " in favor of formula "; - Best.print(dbgs()); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); + dbgs() << "\n" + " in favor of formula "; + Best.print(dbgs()); dbgs() << '\n'); #ifndef NDEBUG ChangedFormulae = true; #endif @@ -4463,7 +4541,7 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() { BestFormulae.clear(); } - DEBUG(if (ChangedFormulae) { + LLVM_DEBUG(if (ChangedFormulae) { dbgs() << "\n" "After filtering out undesirable candidates:\n"; print_uses(dbgs()); @@ -4522,7 +4600,7 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() { // Used in each formula of a solution (in example above this is reg(c)). // We can skip them in calculations. SmallPtrSet<const SCEV *, 4> UniqRegs; - DEBUG(dbgs() << "The search space is too complex.\n"); + LLVM_DEBUG(dbgs() << "The search space is too complex.\n"); // Map each register to probability of not selecting DenseMap <const SCEV *, float> RegNumMap; @@ -4542,7 +4620,8 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() { RegNumMap.insert(std::make_pair(Reg, PNotSel)); } - DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n"); + LLVM_DEBUG( + dbgs() << "Narrowing the search space by deleting costly formulas\n"); // Delete formulas where registers number expectation is high. 
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { @@ -4584,26 +4663,25 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() { MinIdx = i; } } - DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs()); - dbgs() << " with min reg num " << FMinRegNum << '\n'); + LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs()); + dbgs() << " with min reg num " << FMinRegNum << '\n'); if (MinIdx != 0) std::swap(LU.Formulae[MinIdx], LU.Formulae[0]); while (LU.Formulae.size() != 1) { - DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs()); - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs()); + dbgs() << '\n'); LU.Formulae.pop_back(); } LU.RecomputeRegs(LUIdx, RegUses); assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula"); Formula &F = LU.Formulae[0]; - DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n'); // When we choose the formula, the regs become unique. UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); if (F.ScaledReg) UniqRegs.insert(F.ScaledReg); } - DEBUG(dbgs() << "After pre-selection:\n"; - print_uses(dbgs())); + LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } /// Pick a register which seems likely to be profitable, and then in any use @@ -4616,7 +4694,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { while (EstimateSearchSpaceComplexity() >= ComplexityLimit) { // Ok, we have too many of formulae on our hands to conveniently handle. // Use a rough heuristic to thin out the list. - DEBUG(dbgs() << "The search space is too complex.\n"); + LLVM_DEBUG(dbgs() << "The search space is too complex.\n"); // Pick the register which is used by the most LSRUses, which is likely // to be a good reuse register candidate. @@ -4637,8 +4715,8 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { } } - DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best - << " will yield profitable reuse.\n"); + LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best + << " will yield profitable reuse.\n"); Taken.insert(Best); // In any use with formulae which references this register, delete formulae @@ -4651,7 +4729,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { Formula &F = LU.Formulae[i]; if (!F.referencesReg(Best)) { - DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); LU.DeleteFormula(F); --e; --i; @@ -4665,8 +4743,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { LU.RecomputeRegs(LUIdx, RegUses); } - DEBUG(dbgs() << "After pre-selection:\n"; - print_uses(dbgs())); + LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } } @@ -4748,11 +4825,11 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, if (F.getNumRegs() == 1 && Workspace.size() == 1) VisitedRegs.insert(F.ScaledReg ? 
F.ScaledReg : F.BaseRegs[0]); } else { - DEBUG(dbgs() << "New best at "; NewCost.print(dbgs()); - dbgs() << ".\n Regs:"; - for (const SCEV *S : NewRegs) - dbgs() << ' ' << *S; - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs()); + dbgs() << ".\n Regs:"; for (const SCEV *S + : NewRegs) dbgs() + << ' ' << *S; + dbgs() << '\n'); SolutionCost = NewCost; Solution = Workspace; @@ -4777,22 +4854,22 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { SolveRecurse(Solution, SolutionCost, Workspace, CurCost, CurRegs, VisitedRegs); if (Solution.empty()) { - DEBUG(dbgs() << "\nNo Satisfactory Solution\n"); + LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n"); return; } // Ok, we've now made all our decisions. - DEBUG(dbgs() << "\n" - "The chosen solution requires "; SolutionCost.print(dbgs()); - dbgs() << ":\n"; - for (size_t i = 0, e = Uses.size(); i != e; ++i) { - dbgs() << " "; - Uses[i].print(dbgs()); - dbgs() << "\n" - " "; - Solution[i]->print(dbgs()); - dbgs() << '\n'; - }); + LLVM_DEBUG(dbgs() << "\n" + "The chosen solution requires "; + SolutionCost.print(dbgs()); dbgs() << ":\n"; + for (size_t i = 0, e = Uses.size(); i != e; ++i) { + dbgs() << " "; + Uses[i].print(dbgs()); + dbgs() << "\n" + " "; + Solution[i]->print(dbgs()); + dbgs() << '\n'; + }); assert(Solution.size() == Uses.size() && "Malformed solution!"); } @@ -4993,7 +5070,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, // Unless the addressing mode will not be folded. if (!Ops.empty() && LU.Kind == LSRUse::Address && isAMCompletelyFolded(TTI, LU, F)) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -5266,7 +5343,8 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, for (const IVStrideUse &U : IU) { if (++NumUsers > MaxIVUsers) { (void)U; - DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U << "\n"); + LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U + << "\n"); return; } // Bail out if we have a PHI on an EHPad that gets a value from a @@ -5299,9 +5377,9 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, } #endif // DEBUG - DEBUG(dbgs() << "\nLSR on loop "; - L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false); - dbgs() << ":\n"); + LLVM_DEBUG(dbgs() << "\nLSR on loop "; + L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false); + dbgs() << ":\n"); // First, perform some low-level loop optimizations. OptimizeShadowIV(); @@ -5312,7 +5390,7 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, // Skip nested loops until we can model them better with formulae. if (!L->empty()) { - DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); + LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); return; } @@ -5322,9 +5400,11 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, CollectFixupsAndInitialFormulae(); CollectLoopInvariantFixupsAndFormulae(); - assert(!Uses.empty() && "IVUsers reported at least one use"); - DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n"; - print_uses(dbgs())); + if (Uses.empty()) + return; + + LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n"; + print_uses(dbgs())); // Now use the reuse data to generate a bunch of interesting ways // to formulate the values needed for the uses. 
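Stepping back from the individual hunks, the end-to-end effect of LSR is easiest to see at the source level: addressing that scales an induction variable is rewritten into a cheaper recurrence. An illustrative before/after (hand-written, not compiler output):

// Before: each iteration computes the address A + I*4 and compares the
// counter I against N.
int sumCounted(const int *A, int N) {
  int S = 0;
  for (int I = 0; I < N; ++I)
    S += A[I];
  return S;
}

// After strength reduction (schematically): the scaled address becomes
// a pointer recurrence and the exit test compares against a
// precomputed end pointer, removing the per-iteration scaling.
int sumReduced(const int *A, int N) {
  int S = 0;
  for (const int *P = A, *E = A + N; P != E; ++P)
    S += *P;
  return S;
}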
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp new file mode 100644 index 000000000000..86c99aed4417 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -0,0 +1,447 @@ +//===- LoopUnrollAndJam.cpp - Loop unroll and jam pass --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements an unroll and jam pass. Most of the work is done by +// Utils/UnrollLoopAndJam.cpp. +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <string> + +using namespace llvm; + +#define DEBUG_TYPE "loop-unroll-and-jam" + +static cl::opt<bool> + AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden, + cl::desc("Allows loops to be unroll-and-jammed.")); + +static cl::opt<unsigned> UnrollAndJamCount( + "unroll-and-jam-count", cl::Hidden, + cl::desc("Use this unroll count for all loops including those with " + "unroll_and_jam_count pragma values, for testing purposes")); + +static cl::opt<unsigned> UnrollAndJamThreshold( + "unroll-and-jam-threshold", cl::init(60), cl::Hidden, + cl::desc("Threshold to use for inner loop when doing unroll and jam.")); + +static cl::opt<unsigned> PragmaUnrollAndJamThreshold( + "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden, + cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or " + "unroll_count pragma.")); + +// Returns the loop hint metadata node with the given name (for example, +// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is +// returned. +static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) { + if (MDNode *LoopID = L->getLoopID()) + return GetUnrollMetadata(LoopID, Name); + return nullptr; +} + +// Returns true if the loop has any metadata starting with Prefix. 
For example a +// Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata. +static bool HasAnyUnrollPragma(const Loop *L, StringRef Prefix) { + if (MDNode *LoopID = L->getLoopID()) { + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { + MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (!MD) + continue; + + MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + if (!S) + continue; + + if (S->getString().startswith(Prefix)) + return true; + } + } + return false; +} + +// Returns true if the loop has an unroll_and_jam(enable) pragma. +static bool HasUnrollAndJamEnablePragma(const Loop *L) { + return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable"); +} + +// Returns true if the loop has an unroll_and_jam(disable) pragma. +static bool HasUnrollAndJamDisablePragma(const Loop *L) { + return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.disable"); +} + +// If loop has an unroll_and_jam_count pragma return the (necessarily +// positive) value from the pragma. Otherwise return 0. +static unsigned UnrollAndJamCountPragmaValue(const Loop *L) { + MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count"); + if (MD) { + assert(MD->getNumOperands() == 2 && + "Unroll count hint metadata should have two operands."); + unsigned Count = + mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue(); + assert(Count >= 1 && "Unroll count must be positive."); + return Count; + } + return 0; +} + +// Returns loop size estimation for unrolled loop. +static uint64_t +getUnrollAndJammedLoopSize(unsigned LoopSize, + TargetTransformInfo::UnrollingPreferences &UP) { + assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!"); + return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns; +} + +// Calculates unroll and jam count and writes it to UP.Count. Returns true if +// unroll count was set explicitly. +static bool computeUnrollAndJamCount( + Loop *L, Loop *SubLoop, const TargetTransformInfo &TTI, DominatorTree &DT, + LoopInfo *LI, ScalarEvolution &SE, + const SmallPtrSetImpl<const Value *> &EphValues, + OptimizationRemarkEmitter *ORE, unsigned OuterTripCount, + unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount, + unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP) { + // Check for explicit Count from the "unroll-and-jam-count" option. + bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0; + if (UserUnrollCount) { + UP.Count = UnrollAndJamCount; + UP.Force = true; + if (UP.AllowRemainder && + getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold && + getUnrollAndJammedLoopSize(InnerLoopSize, UP) < + UP.UnrollAndJamInnerLoopThreshold) + return true; + } + + // Check for unroll_and_jam pragmas + unsigned PragmaCount = UnrollAndJamCountPragmaValue(L); + if (PragmaCount > 0) { + UP.Count = PragmaCount; + UP.Runtime = true; + UP.Force = true; + if ((UP.AllowRemainder || (OuterTripMultiple % PragmaCount == 0)) && + getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold && + getUnrollAndJammedLoopSize(InnerLoopSize, UP) < + UP.UnrollAndJamInnerLoopThreshold) + return true; + } + + // Use computeUnrollCount from the loop unroller to get a sensible count + // for the unrolling the outer loop. 
This uses UP.Threshold /
+  // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
+  // We have already checked that the loop has no unroll.* pragmas.
+  unsigned MaxTripCount = 0;
+  bool UseUpperBound = false;
+  bool ExplicitUnroll = computeUnrollCount(
+      L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
+      OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
+  if (ExplicitUnroll || UseUpperBound) {
+    // If the user explicitly set the loop as unrolled, don't unroll-and-jam
+    // it. Leave it for the unroller instead.
+    UP.Count = 0;
+    return false;
+  }
+
+  bool PragmaEnableUnroll = HasUnrollAndJamEnablePragma(L);
+  ExplicitUnroll = PragmaCount > 0 || PragmaEnableUnroll || UserUnrollCount;
+
+  // If the loop has an unrolling pragma, we want to be more aggressive with
+  // unrolling limits.
+  if (ExplicitUnroll && OuterTripCount != 0)
+    UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
+
+  if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+                                UP.UnrollAndJamInnerLoopThreshold) {
+    UP.Count = 0;
+    return false;
+  }
+
+  // If the inner loop count is known and small, leave the entire loop nest
+  // to the unroller.
+  if (!ExplicitUnroll && InnerTripCount &&
+      InnerLoopSize * InnerTripCount < UP.Threshold) {
+    UP.Count = 0;
+    return false;
+  }
+
+  // We have a sensible limit for the outer loop, now adjust it for the inner
+  // loop and UP.UnrollAndJamInnerLoopThreshold.
+  while (UP.Count != 0 && UP.AllowRemainder &&
+         getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+             UP.UnrollAndJamInnerLoopThreshold)
+    UP.Count--;
+
+  if (!ExplicitUnroll) {
+    // Check for situations where unroll and jam is likely to be
+    // unprofitable, including subloops with more than one block.
+    if (SubLoop->getBlocks().size() != 1) {
+      UP.Count = 0;
+      return false;
+    }
+
+    // Limit to loops where there is something to gain from unrolling and
+    // jamming the loop. In this case, look for loads that are invariant in
+    // the outer loop and can become shared.
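A hand-written illustration of that heuristic (illustrative only, not from the patch): the address of B[j] does not depend on the outer induction variable, so after unroll-and-jam one load feeds two outer iterations.

    #include <cassert>
    #include <cstddef>

    // Before: B[j] is loaded N*M times, though its address is invariant in i.
    void addB(int *Out, const int *B, size_t N, size_t M) {
      for (size_t i = 0; i < N; ++i)
        for (size_t j = 0; j < M; ++j)
          Out[i] += B[j];
    }

    // After unroll-and-jam by 2 (N assumed even for brevity): B[j] is loaded
    // only (N/2)*M times; this sharing is what the heuristic is looking for.
    void addBUnJ2(int *Out, const int *B, size_t N, size_t M) {
      for (size_t i = 0; i < N; i += 2)
        for (size_t j = 0; j < M; ++j) {
          int Bj = B[j]; // one load serves two outer iterations
          Out[i] += Bj;
          Out[i + 1] += Bj;
        }
    }

    int main() {
      int A[4] = {0, 0, 0, 0}, C[4] = {0, 0, 0, 0};
      const int B[3] = {1, 2, 3};
      addB(A, B, 4, 3);
      addBUnJ2(C, B, 4, 3);
      for (int i = 0; i < 4; ++i)
        assert(A[i] == C[i]);
      return 0;
    }

The NumInvariant loop that follows counts exactly these outer-invariant loads.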
+ unsigned NumInvariant = 0; + for (BasicBlock *BB : SubLoop->getBlocks()) { + for (Instruction &I : *BB) { + if (auto *Ld = dyn_cast<LoadInst>(&I)) { + Value *V = Ld->getPointerOperand(); + const SCEV *LSCEV = SE.getSCEVAtScope(V, L); + if (SE.isLoopInvariant(LSCEV, L)) + NumInvariant++; + } + } + } + if (NumInvariant == 0) { + UP.Count = 0; + return false; + } + } + + return ExplicitUnroll; +} + +static LoopUnrollResult +tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution &SE, const TargetTransformInfo &TTI, + AssumptionCache &AC, DependenceInfo &DI, + OptimizationRemarkEmitter &ORE, int OptLevel) { + // Quick checks of the correct loop form + if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1) + return LoopUnrollResult::Unmodified; + Loop *SubLoop = L->getSubLoops()[0]; + if (!SubLoop->isLoopSimplifyForm()) + return LoopUnrollResult::Unmodified; + + BasicBlock *Latch = L->getLoopLatch(); + BasicBlock *Exit = L->getExitingBlock(); + BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); + BasicBlock *SubLoopExit = SubLoop->getExitingBlock(); + + if (Latch != Exit || SubLoopLatch != SubLoopExit) + return LoopUnrollResult::Unmodified; + + TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( + L, SE, TTI, OptLevel, None, None, None, None, None, None); + if (AllowUnrollAndJam.getNumOccurrences() > 0) + UP.UnrollAndJam = AllowUnrollAndJam; + if (UnrollAndJamThreshold.getNumOccurrences() > 0) + UP.UnrollAndJamInnerLoopThreshold = UnrollAndJamThreshold; + // Exit early if unrolling is disabled. + if (!UP.UnrollAndJam || UP.UnrollAndJamInnerLoopThreshold == 0) + return LoopUnrollResult::Unmodified; + + LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F[" + << L->getHeader()->getParent()->getName() << "] Loop %" + << L->getHeader()->getName() << "\n"); + + // A loop with any unroll pragma (enabling/disabling/count/etc) is left for + // the unroller, so long as it does not explicitly have unroll_and_jam + // metadata. 
This means #pragma nounroll will disable unroll and jam as well
+  // as unrolling.
+  if (HasUnrollAndJamDisablePragma(L) ||
+      (HasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
+       !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam."))) {
+    LLVM_DEBUG(dbgs() << "  Disabled due to pragma.\n");
+    return LoopUnrollResult::Unmodified;
+  }
+
+  if (!isSafeToUnrollAndJam(L, SE, DT, DI)) {
+    LLVM_DEBUG(dbgs() << "  Disabled due to not being safe.\n");
+    return LoopUnrollResult::Unmodified;
+  }
+
+  // Approximate the loop size and collect useful info
+  unsigned NumInlineCandidates;
+  bool NotDuplicatable;
+  bool Convergent;
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+  unsigned InnerLoopSize =
+      ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
+                          Convergent, TTI, EphValues, UP.BEInsns);
+  unsigned OuterLoopSize =
+      ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+                          TTI, EphValues, UP.BEInsns);
+  LLVM_DEBUG(dbgs() << "  Outer Loop Size: " << OuterLoopSize << "\n");
+  LLVM_DEBUG(dbgs() << "  Inner Loop Size: " << InnerLoopSize << "\n");
+  if (NotDuplicatable) {
+    LLVM_DEBUG(dbgs() << "  Not unrolling loop which contains "
+                         "non-duplicatable instructions.\n");
+    return LoopUnrollResult::Unmodified;
+  }
+  if (NumInlineCandidates != 0) {
+    LLVM_DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
+    return LoopUnrollResult::Unmodified;
+  }
+  if (Convergent) {
+    LLVM_DEBUG(
+        dbgs() << "  Not unrolling loop with convergent instructions.\n");
+    return LoopUnrollResult::Unmodified;
+  }
+
+  // Find trip count and trip multiple
+  unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
+  unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
+  unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
+
+  // Decide if, and by how much, to unroll
+  bool IsCountSetExplicitly = computeUnrollAndJamCount(
+      L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
+      OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP);
+  if (UP.Count <= 1)
+    return LoopUnrollResult::Unmodified;
+  // Unroll factor (Count) must be less than or equal to TripCount.
+  if (OuterTripCount && UP.Count > OuterTripCount)
+    UP.Count = OuterTripCount;
+
+  LoopUnrollResult UnrollResult =
+      UnrollAndJamLoop(L, UP.Count, OuterTripCount, OuterTripMultiple,
+                       UP.UnrollRemainder, LI, &SE, &DT, &AC, &ORE);
+
+  // If the loop has an unroll count pragma or was unrolled with an explicitly
+  // set count, mark it as unrolled to prevent unrolling beyond what was
+  // requested.
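For reference, the metadata tested above normally originates from source-level pragmas. The loop-hint spellings below are assumptions based on clang's pragma support of roughly this period; if a given clang rejects them, read the snippet purely as documentation of which metadata names are involved.

    // Assumed spelling: requests llvm.loop.unroll_and_jam.enable on the
    // outer loop, which HasUnrollAndJamEnablePragma looks for.
    void jammed(int *A, const int *B, int N, int M) {
    #pragma clang loop unroll_and_jam(enable)
      for (int i = 0; i < N; ++i)
        for (int j = 0; j < M; ++j)
          A[i] += B[j];
    }

    // #pragma nounroll attaches llvm.loop.unroll.disable, which, per the
    // logic above, keeps both the unroller and unroll-and-jam away.
    void notJammed(int *A, const int *B, int N, int M) {
    #pragma nounroll
      for (int i = 0; i < N; ++i)
        for (int j = 0; j < M; ++j)
          A[i] += B[j];
    }

Returning to the code: the check that follows uses setLoopAlreadyUnrolled so that a later unroll pass respects an explicitly requested count.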
+ if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly) + L->setLoopAlreadyUnrolled(); + + return UnrollResult; +} + +namespace { + +class LoopUnrollAndJam : public LoopPass { +public: + static char ID; // Pass ID, replacement for typeid + unsigned OptLevel; + + LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) { + initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipLoop(L)) + return false; + + Function &F = *L->getHeader()->getParent(); + + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); + // For the old PM, we can't use OptimizationRemarkEmitter as an analysis + // pass. Function analyses need to be preserved across loop transformations + // but ORE cannot be preserved (see comment before the pass definition). + OptimizationRemarkEmitter ORE(&F); + + LoopUnrollResult Result = + tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); + + if (Result == LoopUnrollResult::FullyUnrolled) + LPM.markLoopAsDeleted(*L); + + return Result != LoopUnrollResult::Unmodified; + } + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG... + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<DependenceAnalysisWrapperPass>(); + getLoopAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char LoopUnrollAndJam::ID = 0; + +INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam", + "Unroll and Jam loops", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) +INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam", + "Unroll and Jam loops", false, false) + +Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) { + return new LoopUnrollAndJam(OptLevel); +} + +PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + const auto &FAM = + AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); + Function *F = L.getHeader()->getParent(); + + auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F); + // FIXME: This should probably be optional rather than required. 
+ if (!ORE) + report_fatal_error( + "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at " + "a higher level"); + + DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); + + LoopUnrollResult Result = tryToUnrollAndJamLoop( + &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel); + + if (Result == LoopUnrollResult::Unmodified) + return PreservedAnalyses::all(); + + return getLoopPassPreservedAnalyses(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 15e7da5e1a7a..634215c9770f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -53,6 +53,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" @@ -164,7 +165,7 @@ static const unsigned NoThreshold = std::numeric_limits<unsigned>::max(); /// Gather the various unrolling parameters based on the defaults, compiler /// flags, TTI overrides and user specified parameters. -static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( +TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel, Optional<unsigned> UserThreshold, Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, Optional<bool> UserRuntime, @@ -191,6 +192,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.Force = false; UP.UpperBound = false; UP.AllowPeeling = true; + UP.UnrollAndJam = false; + UP.UnrollAndJamInnerLoopThreshold = 60; // Override with any target specific settings TTI.getUnrollingPreferences(L, SE, UP); @@ -285,17 +288,17 @@ struct UnrolledInstStateKeyInfo { }; struct EstimatedUnrollCost { - /// \brief The estimated cost after unrolling. + /// The estimated cost after unrolling. unsigned UnrolledCost; - /// \brief The estimated dynamic cost of executing the instructions in the + /// The estimated dynamic cost of executing the instructions in the /// rolled form. unsigned RolledDynamicCost; }; } // end anonymous namespace -/// \brief Figure out if the loop is worth full unrolling. +/// Figure out if the loop is worth full unrolling. /// /// Complete loop unrolling can make some loads constant, and we need to know /// if that would expose any further optimization opportunities. This routine @@ -308,10 +311,10 @@ struct EstimatedUnrollCost { /// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If /// the analysis failed (no benefits expected from the unrolling, or the loop is /// too big to analyze), the returned value is None. 
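A toy model of that bookkeeping (all counts made up; the real analysis uses TTI.getUserCost and genuine per-iteration simplification): walk each would-be iteration, charge everything to RolledDynamicCost, charge only what survives folding to UnrolledCost, and bail once the unrolled size exceeds the budget.

    #include <cstdio>

    // Toy version of the rolled-vs-unrolled cost bookkeeping.
    int main() {
      const unsigned TripCount = 8;            // assumed known trip count
      const unsigned InstsPerIter = 5;         // assumed body size
      const unsigned FoldablePerIter = 3;      // assumed to fold when i is known
      const unsigned MaxUnrolledLoopSize = 40; // size budget

      unsigned RolledDynamicCost = 0, UnrolledCost = 0;
      for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
        RolledDynamicCost += InstsPerIter;
        UnrolledCost += InstsPerIter - FoldablePerIter;
        if (UnrolledCost > MaxUnrolledLoopSize) {
          std::puts("bail out: exceeded size budget");
          return 0;
        }
      }
      // Prints UnrolledCost=16 RolledDynamicCost=40: unrolling looks like a win.
      std::printf("UnrolledCost=%u RolledDynamicCost=%u\n", UnrolledCost,
                  RolledDynamicCost);
      return 0;
    }

The signature change below threads EphValues through so that ephemeral (assume-only) instructions are excluded from these counts.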
-static Optional<EstimatedUnrollCost> -analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, - ScalarEvolution &SE, const TargetTransformInfo &TTI, - unsigned MaxUnrolledLoopSize) { +static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost( + const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, + const SmallPtrSetImpl<const Value *> &EphValues, + const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize) { // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -405,9 +408,9 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // First accumulate the cost of this instruction. if (!Cost.IsFree) { UnrolledCost += TTI.getUserCost(I); - DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration - << "): "); - DEBUG(I->dump()); + LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration " + << Iteration << "): "); + LLVM_DEBUG(I->dump()); } // We must count the cost of every operand which is not free, @@ -442,14 +445,14 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, assert(L->isLCSSAForm(DT) && "Must have loops in LCSSA form to track live-out values."); - DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n"); + LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n"); // Simulate execution of each iteration of the loop counting instructions, // which would be simplified. // Since the same load will take different values on different iterations, // we literally have to go through all loop's iterations. for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) { - DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n"); + LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n"); // Prepare for the iteration by collecting any simplified entry or backedge // inputs. @@ -490,7 +493,9 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // it. We don't change the actual IR, just count optimization // opportunities. for (Instruction &I : *BB) { - if (isa<DbgInfoIntrinsic>(I)) + // These won't get into the final code - don't even try calculating the + // cost for them. + if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I)) continue; // Track this instruction's expected baseline cost when executing the @@ -512,8 +517,13 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // Can't properly model a cost of a call. // FIXME: With a proper cost model we should be able to do it. - if(isa<CallInst>(&I)) - return None; + if (auto *CI = dyn_cast<CallInst>(&I)) { + const Function *Callee = CI->getCalledFunction(); + if (!Callee || TTI.isLoweredToCall(Callee)) { + LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n"); + return None; + } + } // If the instruction might have a side-effect recursively account for // the cost of it and all the instructions leading up to it. @@ -522,10 +532,10 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // If unrolled body turns out to be too big, bail out. if (UnrolledCost > MaxUnrolledLoopSize) { - DEBUG(dbgs() << " Exceeded threshold.. exiting.\n" - << " UnrolledCost: " << UnrolledCost - << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize - << "\n"); + LLVM_DEBUG(dbgs() << " Exceeded threshold.. 
exiting.\n" + << " UnrolledCost: " << UnrolledCost + << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize + << "\n"); return None; } } @@ -578,8 +588,8 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // If we found no optimization opportunities on the first iteration, we // won't find them on later ones too. if (UnrolledCost == RolledDynamicCost) { - DEBUG(dbgs() << " No opportunities found.. exiting.\n" - << " UnrolledCost: " << UnrolledCost << "\n"); + LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n" + << " UnrolledCost: " << UnrolledCost << "\n"); return None; } } @@ -600,20 +610,17 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, } } - DEBUG(dbgs() << "Analysis finished:\n" - << "UnrolledCost: " << UnrolledCost << ", " - << "RolledDynamicCost: " << RolledDynamicCost << "\n"); + LLVM_DEBUG(dbgs() << "Analysis finished:\n" + << "UnrolledCost: " << UnrolledCost << ", " + << "RolledDynamicCost: " << RolledDynamicCost << "\n"); return {{UnrolledCost, RolledDynamicCost}}; } /// ApproximateLoopSize - Approximate the size of the loop. -static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - bool &NotDuplicatable, bool &Convergent, - const TargetTransformInfo &TTI, - AssumptionCache *AC, unsigned BEInsns) { - SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - +unsigned llvm::ApproximateLoopSize( + const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent, + const TargetTransformInfo &TTI, + const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) { CodeMetrics Metrics; for (BasicBlock *BB : L->blocks()) Metrics.analyzeBasicBlock(BB, TTI, EphValues); @@ -706,10 +713,11 @@ static uint64_t getUnrolledLoopSize( // Returns true if unroll count was set explicitly. // Calculates unroll count and writes it to UP.Count. -static bool computeUnrollCount( +bool llvm::computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount, - unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize, + ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues, + OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, + unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. // 1st priority is unroll count set by "unroll-count" option. @@ -729,7 +737,7 @@ static bool computeUnrollCount( UP.Runtime = true; UP.AllowExpensiveTripCount = true; UP.Force = true; - if (UP.AllowRemainder && + if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) && getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold) return true; } @@ -746,8 +754,8 @@ static bool computeUnrollCount( if (ExplicitUnroll && TripCount != 0) { // If the loop has an unrolling pragma, we want to be more aggressive with - // unrolling limits. Set thresholds to at least the PragmaThreshold value - // which is larger than the default limits. + // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold + // value which is larger than the default limits. UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold); UP.PartialThreshold = std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold); @@ -763,7 +771,7 @@ static bool computeUnrollCount( // compute the former when the latter is zero. 
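The full-unroll path below inflates the size budget when simulation shows large dynamic savings. A numeric sketch, with a helper modeled on getFullUnrollBoostingFactor (the in-tree formula may differ in detail; values here are illustrative):

    #include <algorithm>
    #include <cassert>

    // Assumed shape: percentage ratio of rolled dynamic cost to unrolled
    // cost, capped by MaxPercentThresholdBoost.
    unsigned boostingFactor(unsigned UnrolledCost, unsigned RolledDynamicCost,
                            unsigned MaxPercentThresholdBoost) {
      if (UnrolledCost == 0)
        return MaxPercentThresholdBoost;
      return std::min(100 * RolledDynamicCost / UnrolledCost,
                      MaxPercentThresholdBoost);
    }

    int main() {
      unsigned Threshold = 150, MaxBoost = 400; // illustrative settings
      // Simulation said unrolling shrinks dynamic cost from 40 to 16:
      unsigned Boost = boostingFactor(16, 40, MaxBoost);
      assert(Boost == 250);
      unsigned BoostedThreshold = Threshold * Boost / 100;
      assert(BoostedThreshold == 375); // budget grows from 150 to 375
      return 0;
    }

In the hunk below this shows up as the analyzeLoopUnrollCost call taking UP.Threshold * UP.MaxPercentThresholdBoost / 100 as its size cap.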
unsigned ExactTripCount = TripCount; assert((ExactTripCount == 0 || MaxTripCount == 0) && - "ExtractTripCound and MaxTripCount cannot both be non zero."); + "ExtractTripCount and MaxTripCount cannot both be non zero."); unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount; UP.Count = FullUnrollTripCount; if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { @@ -779,7 +787,7 @@ static bool computeUnrollCount( // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, FullUnrollTripCount, DT, SE, TTI, + L, FullUnrollTripCount, DT, SE, EphValues, TTI, UP.Threshold * UP.MaxPercentThresholdBoost / 100)) { unsigned Boost = getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); @@ -794,7 +802,7 @@ static bool computeUnrollCount( } // 4th priority is loop peeling - computePeelCount(L, LoopSize, UP, TripCount); + computePeelCount(L, LoopSize, UP, TripCount, SE); if (UP.PeelCount) { UP.Runtime = false; UP.Count = 1; @@ -802,12 +810,12 @@ static bool computeUnrollCount( } // 5th priority is partial unrolling. - // Try partial unroll only when TripCount could be staticaly calculated. + // Try partial unroll only when TripCount could be statically calculated. if (TripCount) { UP.Partial |= ExplicitUnroll; if (!UP.Partial) { - DEBUG(dbgs() << " will not try to unroll partially because " - << "-unroll-allow-partial not given\n"); + LLVM_DEBUG(dbgs() << " will not try to unroll partially because " + << "-unroll-allow-partial not given\n"); UP.Count = 0; return false; } @@ -894,8 +902,9 @@ static bool computeUnrollCount( // Reduce count based on the type of unrolling and the threshold values. UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount; if (!UP.Runtime) { - DEBUG(dbgs() << " will not try to unroll loop with runtime trip count " - << "-unroll-runtime not given\n"); + LLVM_DEBUG( + dbgs() << " will not try to unroll loop with runtime trip count " + << "-unroll-runtime not given\n"); UP.Count = 0; return false; } @@ -915,12 +924,13 @@ static bool computeUnrollCount( if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) { while (UP.Count != 0 && TripMultiple % UP.Count != 0) UP.Count >>= 1; - DEBUG(dbgs() << "Remainder loop is restricted (that could architecture " - "specific or because the loop contains a convergent " - "instruction), so unroll count must divide the trip " - "multiple, " - << TripMultiple << ". Reducing unroll count from " - << OrigCount << " to " << UP.Count << ".\n"); + LLVM_DEBUG( + dbgs() << "Remainder loop is restricted (that could architecture " + "specific or because the loop contains a convergent " + "instruction), so unroll count must divide the trip " + "multiple, " + << TripMultiple << ". 
Reducing unroll count from " << OrigCount + << " to " << UP.Count << ".\n"); using namespace ore; @@ -942,7 +952,8 @@ static bool computeUnrollCount( if (UP.Count > UP.MaxCount) UP.Count = UP.MaxCount; - DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n"); + LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count + << "\n"); if (UP.Count < 2) UP.Count = 0; return ExplicitUnroll; @@ -955,12 +966,13 @@ static LoopUnrollResult tryToUnrollLoop( Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial, Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound, Optional<bool> ProvidedAllowPeeling) { - DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() - << "] Loop %" << L->getHeader()->getName() << "\n"); + LLVM_DEBUG(dbgs() << "Loop Unroll: F[" + << L->getHeader()->getParent()->getName() << "] Loop %" + << L->getHeader()->getName() << "\n"); if (HasUnrollDisablePragma(L)) return LoopUnrollResult::Unmodified; if (!L->isLoopSimplifyForm()) { - DEBUG( + LLVM_DEBUG( dbgs() << " Not unrolling loop which is not in loop-simplify form.\n"); return LoopUnrollResult::Unmodified; } @@ -975,16 +987,21 @@ static LoopUnrollResult tryToUnrollLoop( // Exit early if unrolling is disabled. if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0)) return LoopUnrollResult::Unmodified; - unsigned LoopSize = ApproximateLoopSize( - L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns); - DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, &AC, EphValues); + + unsigned LoopSize = + ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent, + TTI, EphValues, UP.BEInsns); + LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); if (NotDuplicatable) { - DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" - << " instructions.\n"); + LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" + << " instructions.\n"); return LoopUnrollResult::Unmodified; } if (NumInlineCandidates != 0) { - DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); return LoopUnrollResult::Unmodified; } @@ -1030,7 +1047,7 @@ static LoopUnrollResult tryToUnrollLoop( // loop tests remains the same compared to the non-unrolled version, whereas // the generic upper bound unrolling keeps all but the last loop test so the // number of loop tests goes up which may end up being worse on targets with - // constriained branch predictor resources so is controlled by an option.) + // constrained branch predictor resources so is controlled by an option.) // In addition we only unroll small upper bounds. if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) { MaxTripCount = 0; @@ -1040,9 +1057,9 @@ static LoopUnrollResult tryToUnrollLoop( // computeUnrollCount() decides whether it is beneficial to use upper bound to // fully unroll the loop. 
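The remainder restriction shown just above halves the count (UP.Count >>= 1) until it divides the trip multiple. That arithmetic is easy to check standalone (hypothetical values):

    #include <cassert>

    // Without a remainder loop the unroll count must evenly divide the known
    // trip multiple; halving always terminates since 1 divides everything.
    unsigned reduceCount(unsigned Count, unsigned TripMultiple) {
      while (Count != 0 && TripMultiple % Count != 0)
        Count >>= 1;
      return Count;
    }

    int main() {
      assert(reduceCount(8, 12) == 4);  // 12 % 8 != 0, but 12 % 4 == 0
      assert(reduceCount(16, 24) == 8); // 24 % 16 != 0, but 24 % 8 == 0
      assert(reduceCount(7, 5) == 1);   // degrades all the way to 1
      return 0;
    }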
bool UseUpperBound = false; - bool IsCountSetExplicitly = - computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount, - TripMultiple, LoopSize, UP, UseUpperBound); + bool IsCountSetExplicitly = computeUnrollCount( + L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, + TripMultiple, LoopSize, UP, UseUpperBound); if (!UP.Count) return LoopUnrollResult::Unmodified; // Unroll factor (Count) must be less or equal to TripCount. diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index f2405d9b0c03..b12586758925 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -28,7 +28,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" @@ -39,6 +39,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" @@ -66,7 +67,6 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> @@ -298,9 +298,9 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI, MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount; if (Metrics.notDuplicatable) { - DEBUG(dbgs() << "NOT unswitching loop %" - << L->getHeader()->getName() << ", contents cannot be " - << "duplicated!\n"); + LLVM_DEBUG(dbgs() << "NOT unswitching loop %" << L->getHeader()->getName() + << ", contents cannot be " + << "duplicated!\n"); return false; } } @@ -635,6 +635,12 @@ bool LoopUnswitch::processCurrentLoop() { return true; } + // Do not do non-trivial unswitch while optimizing for size. + // FIXME: Use Function::optForSize(). + if (OptimizeForSize || + loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize)) + return false; + // Run through the instructions in the loop, keeping track of three things: // // - That we do not unswitch loops containing convergent operations, as we @@ -666,12 +672,6 @@ bool LoopUnswitch::processCurrentLoop() { } } - // Do not do non-trivial unswitch while optimizing for size. - // FIXME: Use Function::optForSize(). - if (OptimizeForSize || - loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize)) - return false; - for (IntrinsicInst *Guard : Guards) { Value *LoopCond = FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first; @@ -856,20 +856,20 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, TerminatorInst *TI) { // Check to see if it would be profitable to unswitch current loop. if (!BranchesInfo.CostAllowsUnswitching()) { - DEBUG(dbgs() << "NOT unswitching loop %" - << currentLoop->getHeader()->getName() - << " at non-trivial condition '" << *Val - << "' == " << *LoopCond << "\n" - << ". Cost too high.\n"); + LLVM_DEBUG(dbgs() << "NOT unswitching loop %" + << currentLoop->getHeader()->getName() + << " at non-trivial condition '" << *Val + << "' == " << *LoopCond << "\n" + << ". 
Cost too high.\n"); return false; } if (hasBranchDivergence && getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) { - DEBUG(dbgs() << "NOT unswitching loop %" - << currentLoop->getHeader()->getName() - << " at non-trivial condition '" << *Val - << "' == " << *LoopCond << "\n" - << ". Condition is divergent.\n"); + LLVM_DEBUG(dbgs() << "NOT unswitching loop %" + << currentLoop->getHeader()->getName() + << " at non-trivial condition '" << *Val + << "' == " << *LoopCond << "\n" + << ". Condition is divergent.\n"); return false; } @@ -910,6 +910,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, BranchInst *OldBranch, TerminatorInst *TI) { assert(OldBranch->isUnconditional() && "Preheader is not split correctly"); + assert(TrueDest != FalseDest && "Branch targets should be different"); // Insert a conditional branch on LIC to the two preheaders. The original // code is the true version and the new code is the false version. Value *BranchVal = LIC; @@ -942,9 +943,9 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, if (DT) { // First, add both successors. SmallVector<DominatorTree::UpdateType, 3> Updates; - if (TrueDest != OldBranchParent) + if (TrueDest != OldBranchSucc) Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest}); - if (FalseDest != OldBranchParent) + if (FalseDest != OldBranchSucc) Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest}); // If both of the new successors are different from the old one, inform the // DT that the edge was deleted. @@ -970,11 +971,15 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, BasicBlock *ExitBlock, TerminatorInst *TI) { - DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %" - << loopHeader->getName() << " [" << L->getBlocks().size() - << " blocks] in Function " - << L->getHeader()->getParent()->getName() << " on cond: " << *Val - << " == " << *Cond << "\n"); + LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %" + << loopHeader->getName() << " [" << L->getBlocks().size() + << " blocks] in Function " + << L->getHeader()->getParent()->getName() + << " on cond: " << *Val << " == " << *Cond << "\n"); + // We are going to make essential changes to CFG. This may invalidate cached + // information for L or one of its parent loops in SCEV. + if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>()) + SEWP->getSE().forgetTopmostLoop(L); // First step, split the preheader, so that we know that there is a safe place // to insert the conditional branch. We will change loopPreheader to have a @@ -1038,7 +1043,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { // until it finds the trivial condition candidate (condition that is not a // constant). Since unswitching generates branches with constant conditions, // this scenario could be very common in practice. 
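For orientation, the transformation all of this costing serves, sketched at the source level (schematic, not taken from the patch):

    #include <cstddef>

    // Before: the branch on Flag runs every iteration even though Flag never
    // changes inside the loop.
    void before(int *A, size_t N, bool Flag) {
      for (size_t i = 0; i < N; ++i) {
        if (Flag)
          A[i] *= 2;
        else
          A[i] += 1;
      }
    }

    // After unswitching: the invariant test runs once and each clone has a
    // straight-line body. The duplication is the code growth that
    // CostAllowsUnswitching guards against.
    void after(int *A, size_t N, bool Flag) {
      if (Flag) {
        for (size_t i = 0; i < N; ++i)
          A[i] *= 2;
      } else {
        for (size_t i = 0; i < N; ++i)
          A[i] += 1;
      }
    }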
- SmallSet<BasicBlock*, 8> Visited; + SmallPtrSet<BasicBlock*, 8> Visited; while (true) { // If we exit loop or reach a previous visited block, then @@ -1196,13 +1201,15 @@ void LoopUnswitch::SplitExitEdges(Loop *L, void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, Loop *L, TerminatorInst *TI) { Function *F = loopHeader->getParent(); - DEBUG(dbgs() << "loop-unswitch: Unswitching loop %" - << loopHeader->getName() << " [" << L->getBlocks().size() - << " blocks] in Function " << F->getName() - << " when '" << *Val << "' == " << *LIC << "\n"); + LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %" + << loopHeader->getName() << " [" << L->getBlocks().size() + << " blocks] in Function " << F->getName() << " when '" + << *Val << "' == " << *LIC << "\n"); + // We are going to make essential changes to CFG. This may invalidate cached + // information for L or one of its parent loops in SCEV. if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>()) - SEWP->getSE().forgetLoop(L); + SEWP->getSE().forgetTopmostLoop(L); LoopBlocks.clear(); NewBlocks.clear(); @@ -1355,7 +1362,7 @@ static void RemoveFromWorklist(Instruction *I, static void ReplaceUsesOfWith(Instruction *I, Value *V, std::vector<Instruction*> &Worklist, Loop *L, LPPassManager *LPM) { - DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n"); + LLVM_DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n"); // Add uses to the worklist, which may be dead now. for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) @@ -1524,7 +1531,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // Simple DCE. if (isInstructionTriviallyDead(I)) { - DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n"); + LLVM_DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n"); // Add uses to the worklist, which may be dead now. for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) @@ -1557,8 +1564,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { if (!SinglePred) continue; // Nothing to do. assert(SinglePred == Pred && "CFG broken"); - DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " - << Succ->getName() << "\n"); + LLVM_DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " + << Succ->getName() << "\n"); // Resolve any single entry PHI nodes in Succ. while (PHINode *PN = dyn_cast<PHINode>(Succ->begin())) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 53b25e688e82..06e86081e8a0 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -68,6 +68,7 @@ #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" @@ -85,6 +86,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include <cassert> @@ -111,7 +113,7 @@ static cl::opt<unsigned> LVLoopDepthThreshold( "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"), cl::init(2), cl::Hidden); -/// \brief Create MDNode for input string. +/// Create MDNode for input string. 
static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) { LLVMContext &Context = TheLoop->getHeader()->getContext(); Metadata *MDs[] = { @@ -120,7 +122,7 @@ static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) { return MDNode::get(Context, MDs); } -/// \brief Set input string into loop metadata by keeping other values intact. +/// Set input string into loop metadata by keeping other values intact. void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString, unsigned V) { SmallVector<Metadata *, 4> MDs(1); @@ -166,6 +168,7 @@ struct LoopVersioningLICM : public LoopPass { AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addPreserved<AAResultsWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } StringRef getPassName() const override { return "Loop Versioning for LICM"; } @@ -178,6 +181,7 @@ struct LoopVersioningLICM : public LoopPass { LoadAndStoreCounter = 0; InvariantCounter = 0; IsReadOnlyLoop = true; + ORE = nullptr; CurAST.reset(); } @@ -207,7 +211,7 @@ private: Loop *CurLoop = nullptr; // AliasSet information for the current loop. - std::unique_ptr<AliasSetTracker> CurAST; + std::unique_ptr<AliasSetTracker> CurAST; // Maximum loop nest threshold unsigned LoopDepthThreshold; @@ -224,6 +228,9 @@ private: // Read only loop marker. bool IsReadOnlyLoop = true; + // OptimizationRemarkEmitter + OptimizationRemarkEmitter *ORE; + bool isLegalForVersioning(); bool legalLoopStructure(); bool legalLoopInstructions(); @@ -235,58 +242,57 @@ private: } // end anonymous namespace -/// \brief Check loop structure and confirms it's good for LoopVersioningLICM. +/// Check loop structure and confirms it's good for LoopVersioningLICM. bool LoopVersioningLICM::legalLoopStructure() { // Loop must be in loop simplify form. if (!CurLoop->isLoopSimplifyForm()) { - DEBUG( - dbgs() << " loop is not in loop-simplify form.\n"); + LLVM_DEBUG(dbgs() << " loop is not in loop-simplify form.\n"); return false; } // Loop should be innermost loop, if not return false. if (!CurLoop->getSubLoops().empty()) { - DEBUG(dbgs() << " loop is not innermost\n"); + LLVM_DEBUG(dbgs() << " loop is not innermost\n"); return false; } // Loop should have a single backedge, if not return false. if (CurLoop->getNumBackEdges() != 1) { - DEBUG(dbgs() << " loop has multiple backedges\n"); + LLVM_DEBUG(dbgs() << " loop has multiple backedges\n"); return false; } // Loop must have a single exiting block, if not return false. if (!CurLoop->getExitingBlock()) { - DEBUG(dbgs() << " loop has multiple exiting block\n"); + LLVM_DEBUG(dbgs() << " loop has multiple exiting block\n"); return false; } // We only handle bottom-tested loop, i.e. loop in which the condition is // checked at the end of each iteration. With that we can assume that all // instructions in the loop are executed the same number of times. if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) { - DEBUG(dbgs() << " loop is not bottom tested\n"); + LLVM_DEBUG(dbgs() << " loop is not bottom tested\n"); return false; } // Parallel loops must not have aliasing loop-invariant memory accesses. // Hence we don't need to version anything in this case. 
if (CurLoop->isAnnotatedParallel()) { - DEBUG(dbgs() << " Parallel loop is not worth versioning\n"); + LLVM_DEBUG(dbgs() << " Parallel loop is not worth versioning\n"); return false; } // Loop depth more then LoopDepthThreshold are not allowed if (CurLoop->getLoopDepth() > LoopDepthThreshold) { - DEBUG(dbgs() << " loop depth is more then threshold\n"); + LLVM_DEBUG(dbgs() << " loop depth is more then threshold\n"); return false; } // We need to be able to compute the loop trip count in order // to generate the bound checks. const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop); if (ExitCount == SE->getCouldNotCompute()) { - DEBUG(dbgs() << " loop does not has trip count\n"); + LLVM_DEBUG(dbgs() << " loop does not has trip count\n"); return false; } return true; } -/// \brief Check memory accesses in loop and confirms it's good for +/// Check memory accesses in loop and confirms it's good for /// LoopVersioningLICM. bool LoopVersioningLICM::legalLoopMemoryAccesses() { bool HasMayAlias = false; @@ -328,24 +334,24 @@ bool LoopVersioningLICM::legalLoopMemoryAccesses() { } // Ensure types should be of same type. if (!TypeSafety) { - DEBUG(dbgs() << " Alias tracker type safety failed!\n"); + LLVM_DEBUG(dbgs() << " Alias tracker type safety failed!\n"); return false; } // Ensure loop body shouldn't be read only. if (!HasMod) { - DEBUG(dbgs() << " No memory modified in loop body\n"); + LLVM_DEBUG(dbgs() << " No memory modified in loop body\n"); return false; } // Make sure alias set has may alias case. // If there no alias memory ambiguity, return false. if (!HasMayAlias) { - DEBUG(dbgs() << " No ambiguity in memory access.\n"); + LLVM_DEBUG(dbgs() << " No ambiguity in memory access.\n"); return false; } return true; } -/// \brief Check loop instructions safe for Loop versioning. +/// Check loop instructions safe for Loop versioning. /// It returns true if it's safe else returns false. /// Consider following: /// 1) Check all load store in loop body are non atomic & non volatile. 
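Stepping back from the individual legality checks: what LoopVersioningLICM ultimately emits is a pair of loops selected by a runtime check, so that LICM can hoist in the aggressive copy. A simplified hand-written picture (the real pass derives its checks from LoopAccessInfo rather than a single bound test):

    #include <cstddef>
    #include <cstdint>

    // Original: *Ptr may alias A[0..N), so the load cannot be hoisted.
    void original(int *A, const int *Ptr, size_t N) {
      for (size_t i = 0; i < N; ++i)
        A[i] += *Ptr;
    }

    // Shape of the versioned output: a runtime overlap check guards an
    // aggressive copy in which the load is invariant and hoistable.
    void versioned(int *A, const int *Ptr, size_t N) {
      uintptr_t P = reinterpret_cast<uintptr_t>(Ptr);
      uintptr_t Lo = reinterpret_cast<uintptr_t>(A);
      uintptr_t Hi = reinterpret_cast<uintptr_t>(A + N);
      if (P < Lo || P >= Hi) {
        int V = *Ptr; // hoisted: provably not clobbered by the stores to A
        for (size_t i = 0; i < N; ++i)
          A[i] += V;
      } else {
        for (size_t i = 0; i < N; ++i) // unmodified fallback version
          A[i] += *Ptr;
      }
    }

The hunks that follow vet each instruction before this versioning is allowed.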
@@ -355,12 +361,12 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) { assert(I != nullptr && "Null instruction found!"); // Check function call safety if (isa<CallInst>(I) && !AA->doesNotAccessMemory(CallSite(I))) { - DEBUG(dbgs() << " Unsafe call site found.\n"); + LLVM_DEBUG(dbgs() << " Unsafe call site found.\n"); return false; } // Avoid loops with possiblity of throw if (I->mayThrow()) { - DEBUG(dbgs() << " May throw instruction found in loop body\n"); + LLVM_DEBUG(dbgs() << " May throw instruction found in loop body\n"); return false; } // If current instruction is load instructions @@ -368,7 +374,7 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) { if (I->mayReadFromMemory()) { LoadInst *Ld = dyn_cast<LoadInst>(I); if (!Ld || !Ld->isSimple()) { - DEBUG(dbgs() << " Found a non-simple load.\n"); + LLVM_DEBUG(dbgs() << " Found a non-simple load.\n"); return false; } LoadAndStoreCounter++; @@ -382,7 +388,7 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) { else if (I->mayWriteToMemory()) { StoreInst *St = dyn_cast<StoreInst>(I); if (!St || !St->isSimple()) { - DEBUG(dbgs() << " Found a non-simple store.\n"); + LLVM_DEBUG(dbgs() << " Found a non-simple store.\n"); return false; } LoadAndStoreCounter++; @@ -396,59 +402,87 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) { return true; } -/// \brief Check loop instructions and confirms it's good for +/// Check loop instructions and confirms it's good for /// LoopVersioningLICM. bool LoopVersioningLICM::legalLoopInstructions() { // Resetting counters. LoadAndStoreCounter = 0; InvariantCounter = 0; IsReadOnlyLoop = true; + using namespace ore; // Iterate over loop blocks and instructions of each block and check // instruction safety. for (auto *Block : CurLoop->getBlocks()) for (auto &Inst : *Block) { // If instruction is unsafe just return false. - if (!instructionSafeForVersioning(&Inst)) + if (!instructionSafeForVersioning(&Inst)) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopInst", &Inst) + << " Unsafe Loop Instruction"; + }); return false; + } } // Get LoopAccessInfo from current loop. LAI = &LAA->getInfo(CurLoop); // Check LoopAccessInfo for need of runtime check. if (LAI->getRuntimePointerChecking()->getChecks().empty()) { - DEBUG(dbgs() << " LAA: Runtime check not found !!\n"); + LLVM_DEBUG(dbgs() << " LAA: Runtime check not found !!\n"); return false; } // Number of runtime-checks should be less then RuntimeMemoryCheckThreshold if (LAI->getNumRuntimePointerChecks() > VectorizerParams::RuntimeMemoryCheckThreshold) { - DEBUG(dbgs() << " LAA: Runtime checks are more than threshold !!\n"); + LLVM_DEBUG( + dbgs() << " LAA: Runtime checks are more than threshold !!\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "RuntimeCheck", + CurLoop->getStartLoc(), + CurLoop->getHeader()) + << "Number of runtime checks " + << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks()) + << " exceeds threshold " + << NV("Threshold", VectorizerParams::RuntimeMemoryCheckThreshold); + }); return false; } // Loop should have at least one invariant load or store instruction. if (!InvariantCounter) { - DEBUG(dbgs() << " Invariant not found !!\n"); + LLVM_DEBUG(dbgs() << " Invariant not found !!\n"); return false; } // Read only loop not allowed. 
if (IsReadOnlyLoop) { - DEBUG(dbgs() << " Found a read-only loop!\n"); + LLVM_DEBUG(dbgs() << " Found a read-only loop!\n"); return false; } // Profitablity check: // Check invariant threshold, should be in limit. if (InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter) { - DEBUG(dbgs() - << " Invariant load & store are less then defined threshold\n"); - DEBUG(dbgs() << " Invariant loads & stores: " - << ((InvariantCounter * 100) / LoadAndStoreCounter) << "%\n"); - DEBUG(dbgs() << " Invariant loads & store threshold: " - << InvariantThreshold << "%\n"); + LLVM_DEBUG( + dbgs() + << " Invariant load & store are less then defined threshold\n"); + LLVM_DEBUG(dbgs() << " Invariant loads & stores: " + << ((InvariantCounter * 100) / LoadAndStoreCounter) + << "%\n"); + LLVM_DEBUG(dbgs() << " Invariant loads & store threshold: " + << InvariantThreshold << "%\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "InvariantThreshold", + CurLoop->getStartLoc(), + CurLoop->getHeader()) + << "Invariant load & store " + << NV("LoadAndStoreCounter", + ((InvariantCounter * 100) / LoadAndStoreCounter)) + << " are less then defined threshold " + << NV("Threshold", InvariantThreshold); + }); return false; } return true; } -/// \brief It checks loop is already visited or not. +/// It checks loop is already visited or not. /// check loop meta data, if loop revisited return true /// else false. bool LoopVersioningLICM::isLoopAlreadyVisited() { @@ -459,42 +493,64 @@ bool LoopVersioningLICM::isLoopAlreadyVisited() { return false; } -/// \brief Checks legality for LoopVersioningLICM by considering following: +/// Checks legality for LoopVersioningLICM by considering following: /// a) loop structure legality b) loop instruction legality /// c) loop memory access legality. /// Return true if legal else returns false. bool LoopVersioningLICM::isLegalForVersioning() { - DEBUG(dbgs() << "Loop: " << *CurLoop); + using namespace ore; + LLVM_DEBUG(dbgs() << "Loop: " << *CurLoop); // Make sure not re-visiting same loop again. if (isLoopAlreadyVisited()) { - DEBUG( + LLVM_DEBUG( dbgs() << " Revisiting loop in LoopVersioningLICM not allowed.\n\n"); return false; } // Check loop structure leagality. if (!legalLoopStructure()) { - DEBUG( + LLVM_DEBUG( dbgs() << " Loop structure not suitable for LoopVersioningLICM\n\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopStruct", + CurLoop->getStartLoc(), + CurLoop->getHeader()) + << " Unsafe Loop structure"; + }); return false; } // Check loop instruction leagality. if (!legalLoopInstructions()) { - DEBUG(dbgs() - << " Loop instructions not suitable for LoopVersioningLICM\n\n"); + LLVM_DEBUG( + dbgs() + << " Loop instructions not suitable for LoopVersioningLICM\n\n"); return false; } // Check loop memory access leagality. if (!legalLoopMemoryAccesses()) { - DEBUG(dbgs() - << " Loop memory access not suitable for LoopVersioningLICM\n\n"); + LLVM_DEBUG( + dbgs() + << " Loop memory access not suitable for LoopVersioningLICM\n\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopMemoryAccess", + CurLoop->getStartLoc(), + CurLoop->getHeader()) + << " Unsafe Loop memory access"; + }); return false; } // Loop versioning is feasible, return true. 
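The profitability gate above is plain integer arithmetic. Assuming the pass's default invariant threshold of 25% (an assumption; check the cl::opt initializer), it behaves like this:

    #include <cassert>

    // The pass rejects when
    //   InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter,
    // i.e. when invariant accesses fall below a percentage of all accesses.
    bool profitableToVersion(unsigned InvariantCounter,
                             unsigned LoadAndStoreCounter,
                             unsigned InvariantThreshold) {
      return InvariantCounter * 100 >= InvariantThreshold * LoadAndStoreCounter;
    }

    int main() {
      // At an assumed 25% threshold: 2 invariant accesses out of 10 (20%)
      // lose, 3 out of 10 (30%) qualify.
      assert(!profitableToVersion(2, 10, 25));
      assert(profitableToVersion(3, 10, 25));
      return 0;
    }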
- DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n"); + LLVM_DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n"); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "IsLegalForVersioning", + CurLoop->getStartLoc(), CurLoop->getHeader()) + << " Versioned loop for LICM." + << " Number of runtime checks we had to insert " + << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks()); + }); return true; } -/// \brief Update loop with aggressive aliasing assumptions. +/// Update loop with aggressive aliasing assumptions. /// It marks no-alias to any pairs of memory operations by assuming /// loop should not have any must-alias memory accesses pairs. /// During LoopVersioningLICM legality we ignore loops having must @@ -542,6 +598,7 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) { AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); + ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); LAI = nullptr; // Set Current Loop CurLoop = L; @@ -592,6 +649,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopVersioningLICM, "loop-versioning-licm", "Loop Versioning For LICM", false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 46f8a3564265..68bfa0030395 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -357,7 +357,7 @@ PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F, } namespace { -/// \brief Legacy pass for lowering expect intrinsics out of the IR. +/// Legacy pass for lowering expect intrinsics out of the IR. 
/// /// When this pass is run over a function it uses expect intrinsics which feed /// branches and switches to provide branch weight metadata for those diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 9c870b42a747..3b74421a47a0 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -25,6 +25,7 @@ #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -55,7 +56,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -263,7 +263,7 @@ public: void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) { int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue(); - addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI); + addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlignment(), MSI); } void addRange(int64_t Start, int64_t Size, Value *Ptr, @@ -479,10 +479,10 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); - DEBUG(dbgs() << "Replace stores:\n"; - for (Instruction *SI : Range.TheStores) - dbgs() << *SI << '\n'; - dbgs() << "With: " << *AMemSet << '\n'); + LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI + : Range.TheStores) dbgs() + << *SI << '\n'; + dbgs() << "With: " << *AMemSet << '\n'); if (!Range.TheStores.empty()) AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); @@ -498,16 +498,25 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, return AMemSet; } -static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI, - const LoadInst *LI) { +static unsigned findStoreAlignment(const DataLayout &DL, const StoreInst *SI) { unsigned StoreAlign = SI->getAlignment(); if (!StoreAlign) StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType()); + return StoreAlign; +} + +static unsigned findLoadAlignment(const DataLayout &DL, const LoadInst *LI) { unsigned LoadAlign = LI->getAlignment(); if (!LoadAlign) LoadAlign = DL.getABITypeAlignment(LI->getType()); + return LoadAlign; +} - return std::min(StoreAlign, LoadAlign); +static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI, + const LoadInst *LI) { + unsigned StoreAlign = findStoreAlignment(DL, SI); + unsigned LoadAlign = findLoadAlignment(DL, LI); + return MinAlign(StoreAlign, LoadAlign); } // This method try to lift a store instruction before position P. @@ -522,7 +531,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, return false; // Keep track of the arguments of all instruction we plan to lift - // so we can make sure to lift them as well if apropriate. + // so we can make sure to lift them as well if appropriate. 
DenseSet<Instruction*> Args; if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand())) if (Ptr->getParent() == SI->getParent()) @@ -594,7 +603,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, // We made it, we need to lift for (auto *I : llvm::reverse(ToLift)) { - DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n"); + LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n"); I->moveBefore(P); } @@ -656,22 +665,23 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc)) UseMemMove = true; - unsigned Align = findCommonAlignment(DL, SI, LI); uint64_t Size = DL.getTypeStoreSize(T); IRBuilder<> Builder(P); Instruction *M; if (UseMemMove) - M = Builder.CreateMemMove(SI->getPointerOperand(), - LI->getPointerOperand(), Size, - Align, SI->isVolatile()); + M = Builder.CreateMemMove( + SI->getPointerOperand(), findStoreAlignment(DL, SI), + LI->getPointerOperand(), findLoadAlignment(DL, LI), Size, + SI->isVolatile()); else - M = Builder.CreateMemCpy(SI->getPointerOperand(), - LI->getPointerOperand(), Size, - Align, SI->isVolatile()); + M = Builder.CreateMemCpy( + SI->getPointerOperand(), findStoreAlignment(DL, SI), + LI->getPointerOperand(), findLoadAlignment(DL, LI), Size, + SI->isVolatile()); - DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI - << " => " << *M << "\n"); + LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => " + << *M << "\n"); MD->removeInstruction(SI); SI->eraseFromParent(); @@ -760,7 +770,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size, Align, SI->isVolatile()); - DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n"); + LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n"); MD->removeInstruction(SI); SI->eraseFromParent(); @@ -1047,20 +1057,17 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // If all checks passed, then we can transform M. - // Make sure to use the lesser of the alignment of the source and the dest - // since we're changing where we're reading from, but don't want to increase - // the alignment past what can be read from or written to. // TODO: Is this worth it if we're creating a less aligned memcpy? For // example we could be moving from movaps -> movq on x86. - unsigned Align = std::min(MDep->getAlignment(), M->getAlignment()); - IRBuilder<> Builder(M); if (UseMemMove) - Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(), - Align, M->isVolatile()); + Builder.CreateMemMove(M->getRawDest(), M->getDestAlignment(), + MDep->getRawSource(), MDep->getSourceAlignment(), + M->getLength(), M->isVolatile()); else - Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(), - Align, M->isVolatile()); + Builder.CreateMemCpy(M->getRawDest(), M->getDestAlignment(), + MDep->getRawSource(), MDep->getSourceAlignment(), + M->getLength(), M->isVolatile()); // Remove the instruction we're replacing. MD->removeInstruction(M); @@ -1106,7 +1113,7 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, // If Dest is aligned, and SrcSize is constant, use the minimum alignment // of the sum. 
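The alignment reasoning in these hunks, the "minimum alignment of the sum" comment and findCommonAlignment now using MinAlign, reduces to taking the largest power of two that divides both quantities. A standalone check with a local stand-in for llvm::MinAlign:

    #include <cassert>
    #include <cstdint>

    // Largest power of two dividing both A and B: the guarantee MinAlign
    // gives (local stand-in for the helper in llvm/Support/MathExtras.h).
    uint64_t minAlign(uint64_t A, uint64_t B) {
      uint64_t C = A | B;
      return C & (~C + 1); // lowest set bit
    }

    int main() {
      // A 16-byte aligned destination whose first SrcSize = 8 bytes were
      // already memset: the tail at Dest + 8 is only 8-byte aligned.
      assert(minAlign(/*SrcSize=*/8, /*DestAlign=*/16) == 8);
      // With SrcSize = 32 the tail keeps the full 16-byte alignment.
      assert(minAlign(32, 16) == 16);
      return 0;
    }

The hunk that follows applies exactly this, via MinAlign(SrcSizeC->getZExtValue(), DestAlign).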
const unsigned DestAlign = - std::max(MemSet->getAlignment(), MemCpy->getAlignment()); + std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment()); if (DestAlign > 1) if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize)) Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign); @@ -1166,7 +1173,7 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, IRBuilder<> Builder(MemCpy); Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1), - CopySize, MemCpy->getAlignment()); + CopySize, MemCpy->getDestAlignment()); return true; } @@ -1192,7 +1199,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) { if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) { IRBuilder<> Builder(M); Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(), - M->getAlignment(), false); + M->getDestAlignment(), false); MD->removeInstruction(M); M->eraseFromParent(); ++NumCpyToSet; @@ -1221,8 +1228,11 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) { // d) memcpy from a just-memset'd source can be turned into memset. if (DepInfo.isClobber()) { if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { + // FIXME: Can we pass in either of dest/src alignment here instead + // of conservatively taking the minimum? + unsigned Align = MinAlign(M->getDestAlignment(), M->getSourceAlignment()); if (performCallSlotOptzn(M, M->getDest(), M->getSource(), - CopySize->getZExtValue(), M->getAlignment(), + CopySize->getZExtValue(), Align, C)) { MD->removeInstruction(M); M->eraseFromParent(); @@ -1284,8 +1294,8 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) { MemoryLocation::getForSource(M))) return false; - DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M - << "\n"); + LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M + << "\n"); // If not, then we know we can transform this. Type *ArgTys[3] = { M->getRawDest()->getType(), @@ -1337,7 +1347,7 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) { // source of the memcpy to the alignment we need. If we fail, we bail out. AssumptionCache &AC = LookupAssumptionCache(); DominatorTree &DT = LookupDomTree(); - if (MDep->getAlignment() < ByValAlign && + if (MDep->getSourceAlignment() < ByValAlign && getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, CS.getInstruction(), &AC, &DT) < ByValAlign) return false; @@ -1367,9 +1377,9 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) { TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(), "tmpcast", CS.getInstruction()); - DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n" - << " " << *MDep << "\n" - << " " << *CS.getInstruction() << "\n"); + LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n" + << " " << *MDep << "\n" + << " " << *CS.getInstruction() << "\n"); // Otherwise we're good! Update the byval argument. CS.setArgument(ArgNo, TmpCast); @@ -1381,10 +1391,19 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) { bool MemCpyOptPass::iterateOnFunction(Function &F) { bool MadeChange = false; + DominatorTree &DT = LookupDomTree(); + // Walk all instruction in the function. for (BasicBlock &BB : F) { + // Skip unreachable blocks. For example processStore assumes that an + // instruction in a BB can't be dominated by a later instruction in the + // same BB (which is a scenario that can happen for an unreachable BB that + // has itself as a predecessor). 
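The comment being added here describes a degenerate CFG that is easy to build by hand: a block that is unreachable from the entry and is its own only predecessor. A standalone sketch of the situation and of the reachability query that the new guard uses (assuming the LLVM C++ API headers; the function and block names are illustrative):

#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  Function *F = Function::Create(FunctionType::get(Type::getVoidTy(Ctx), false),
                                 Function::ExternalLinkage, "f", &M);
  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
  BasicBlock *Dead = BasicBlock::Create(Ctx, "dead", F);
  IRBuilder<> B(Entry);
  B.CreateRetVoid();
  B.SetInsertPoint(Dead);
  B.CreateBr(Dead); // 'dead' is its own (and only) predecessor
  DominatorTree DT(*F);
  // Prints "unreachable": such a block is skipped by the guard added below.
  outs() << (DT.isReachableFromEntry(Dead) ? "reachable" : "unreachable") << '\n';
  return 0;
}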
+ if (!DT.isReachableFromEntry(&BB)) + continue; + for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { - // Avoid invalidating the iterator. + // Avoid invalidating the iterator. Instruction *I = &*BI++; bool RepeatInstruction = false; diff --git a/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp index 9869a3fb96fa..ff0183a8ea2d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -71,30 +71,30 @@ struct BCEAtom { }; // If this value is a load from a constant offset w.r.t. a base address, and -// there are no othe rusers of the load or address, returns the base address and +// there are no other users of the load or address, returns the base address and // the offset. BCEAtom visitICmpLoadOperand(Value *const Val) { BCEAtom Result; if (auto *const LoadI = dyn_cast<LoadInst>(Val)) { - DEBUG(dbgs() << "load\n"); + LLVM_DEBUG(dbgs() << "load\n"); if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) { - DEBUG(dbgs() << "used outside of block\n"); + LLVM_DEBUG(dbgs() << "used outside of block\n"); return {}; } if (LoadI->isVolatile()) { - DEBUG(dbgs() << "volatile\n"); + LLVM_DEBUG(dbgs() << "volatile\n"); return {}; } Value *const Addr = LoadI->getOperand(0); if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) { - DEBUG(dbgs() << "GEP\n"); + LLVM_DEBUG(dbgs() << "GEP\n"); if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) { - DEBUG(dbgs() << "used outside of block\n"); + LLVM_DEBUG(dbgs() << "used outside of block\n"); return {}; } const auto &DL = GEP->getModule()->getDataLayout(); if (!isDereferenceablePointer(GEP, DL)) { - DEBUG(dbgs() << "not dereferenceable\n"); + LLVM_DEBUG(dbgs() << "not dereferenceable\n"); // We need to make sure that we can do comparison in any order, so we // require memory to be unconditionnally dereferencable. return {}; @@ -110,6 +110,10 @@ BCEAtom visitICmpLoadOperand(Value *const Val) { } // A basic block with a comparison between two BCE atoms. +// The block might do extra work besides the atom comparison, in which case +// doesOtherWork() returns true. Under some conditions, the block can be +// split into the atom comparison part and the "other work" part +// (see canSplit()). // Note: the terminology is misleading: the comparison is symmetric, so there // is no real {l/r}hs. What we want though is to have the same base on the // left (resp. right), so that we can detect consecutive loads. To ensure this @@ -127,7 +131,7 @@ class BCECmpBlock { return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr; } - // Assert the the block is consistent: If valid, it should also have + // Assert the block is consistent: If valid, it should also have // non-null members besides Lhs_ and Rhs_. void AssertConsistent() const { if (IsValid()) { @@ -144,37 +148,95 @@ class BCECmpBlock { // Returns true if the block does other works besides comparison. bool doesOtherWork() const; + // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp + // instructions in the block. + bool canSplit() const; + + // Return true if all the relevant instructions in the BCE-cmp-block can + // be sunk below this instruction. By doing this, we know we can separate the + // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the + // block.
+ bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &) const; + + // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block + // instructions. Split the old block and move all non-BCE-cmp-insts into the + // new parent block. + void split(BasicBlock *NewParent) const; + // The basic block where this comparison happens. BasicBlock *BB = nullptr; // The ICMP for this comparison. ICmpInst *CmpI = nullptr; // The terminating branch. BranchInst *BranchI = nullptr; + // The block requires splitting. + bool RequireSplit = false; - private: +private: BCEAtom Lhs_; BCEAtom Rhs_; int SizeBits_ = 0; }; +bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, + DenseSet<Instruction *> &BlockInsts) const { + // If this instruction has side effects and it's in the middle of the BCE cmp block + // instructions, then bail for now. + // TODO: use alias analysis to tell whether there is real interference. + if (Inst->mayHaveSideEffects()) + return false; + // Make sure this instruction does not use any of the BCE cmp block + // instructions as an operand. + for (auto BI : BlockInsts) { + if (is_contained(Inst->operands(), BI)) + return false; + } + return true; +} + +void BCECmpBlock::split(BasicBlock *NewParent) const { + DenseSet<Instruction *> BlockInsts( + {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI}); + llvm::SmallVector<Instruction *, 4> OtherInsts; + for (Instruction &Inst : *BB) { + if (BlockInsts.count(&Inst)) + continue; + assert(canSinkBCECmpInst(&Inst, BlockInsts) && "Split unsplittable block"); + // This is a non-BCE-cmp-block instruction, and it can be separated + // from the BCE-cmp-block instructions. + OtherInsts.push_back(&Inst); + } + + // Do the actual splitting. + for (Instruction *Inst : reverse(OtherInsts)) { + Inst->moveBefore(&*NewParent->begin()); + } +} + +bool BCECmpBlock::canSplit() const { + DenseSet<Instruction *> BlockInsts( + {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI}); + for (Instruction &Inst : *BB) { + if (!BlockInsts.count(&Inst)) { + if (!canSinkBCECmpInst(&Inst, BlockInsts)) + return false; + } + } + return true; +} + bool BCECmpBlock::doesOtherWork() const { AssertConsistent(); + // All the instructions we care about in the BCE cmp block. + DenseSet<Instruction *> BlockInsts( + {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI}); // TODO(courbet): Can we allow some other things ? This is very conservative. - // We might be able to get away with anything does does not have any side + // We might be able to get away with anything that does not have any side // effects outside of the basic block. // Note: The GEPs and/or loads are not necessarily in the same block. for (const Instruction &Inst : *BB) { - if (const auto *const GEP = dyn_cast<GetElementPtrInst>(&Inst)) { - if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP)) return true; - } else if (const auto *const L = dyn_cast<LoadInst>(&Inst)) { - if (!(Lhs_.LoadI == L || Rhs_.LoadI == L)) return true; - } else if (const auto *const C = dyn_cast<ICmpInst>(&Inst)) { - if (C != CmpI) return true; - } else if (const auto *const Br = dyn_cast<BranchInst>(&Inst)) { - if (Br != BranchI) return true; - } else { + if (!BlockInsts.count(&Inst)) return true; - } } return false; } @@ -183,10 +245,19 @@ bool BCECmpBlock::doesOtherWork() const { // BCE atoms, returns the comparison. BCECmpBlock visitICmp(const ICmpInst *const CmpI, const ICmpInst::Predicate ExpectedPredicate) { + // The comparison can only be used once: + // - For intermediate blocks, as a branch condition.
+ // - For the final block, as an incoming value for the Phi. + // If there are any other uses of the comparison, we cannot merge it with + // other comparisons as we would create an orphan use of the value. + if (!CmpI->hasOneUse()) { + LLVM_DEBUG(dbgs() << "cmp has several uses\n"); + return {}; + } if (CmpI->getPredicate() == ExpectedPredicate) { - DEBUG(dbgs() << "cmp " - << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne") - << "\n"); + LLVM_DEBUG(dbgs() << "cmp " + << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne") + << "\n"); auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0)); if (!Lhs.Base()) return {}; auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1)); @@ -204,7 +275,7 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block, if (Block->empty()) return {}; auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator()); if (!BranchI) return {}; - DEBUG(dbgs() << "branch\n"); + LLVM_DEBUG(dbgs() << "branch\n"); if (BranchI->isUnconditional()) { // In this case, we expect an incoming value which is the result of the // comparison. This is the last link in the chain of comparisons (note @@ -212,7 +283,7 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block, // can be reordered). auto *const CmpI = dyn_cast<ICmpInst>(Val); if (!CmpI) return {}; - DEBUG(dbgs() << "icmp\n"); + LLVM_DEBUG(dbgs() << "icmp\n"); auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ); Result.CmpI = CmpI; Result.BranchI = BranchI; @@ -221,12 +292,12 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block, // In this case, we expect a constant incoming value (the comparison is // chained). const auto *const Const = dyn_cast<ConstantInt>(Val); - DEBUG(dbgs() << "const\n"); + LLVM_DEBUG(dbgs() << "const\n"); if (!Const->isZero()) return {}; - DEBUG(dbgs() << "false\n"); + LLVM_DEBUG(dbgs() << "false\n"); auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition()); if (!CmpI) return {}; - DEBUG(dbgs() << "icmp\n"); + LLVM_DEBUG(dbgs() << "icmp\n"); assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch"); BasicBlock *const FalseBlock = BranchI->getSuccessor(1); auto Result = visitICmp( @@ -238,6 +309,18 @@ BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block, return {}; } +static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons, + BCECmpBlock &Comparison) { + LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName() + << "': Found cmp of " << Comparison.SizeBits() + << " bits between " << Comparison.Lhs().Base() << " + " + << Comparison.Lhs().Offset << " and " + << Comparison.Rhs().Base() << " + " + << Comparison.Rhs().Offset << "\n"); + LLVM_DEBUG(dbgs() << "\n"); + Comparisons.push_back(Comparison); +} + // A chain of comparisons. class BCECmpChain { public: @@ -263,9 +346,9 @@ class BCECmpChain { // Merges the given comparison blocks into one memcmp block and update // branches. Comparisons are assumed to be continguous. If NextBBInChain is // null, the merged block will link to the phi block. 
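At the source level, the chains that BCECmpChain models (and that mergeComparisons, declared below, collapses) typically come from field-by-field equality tests. A hedged illustration, assuming two contiguous i32 fields with no padding:

#include <cstdint>

struct Pair {
  int32_t a;
  int32_t b;
};

// Two BCE (block-compare-equality) blocks: load and compare 'a', branch,
// then load and compare 'b'. With contiguous, padding-free fields the pass
// can merge the chain into a single call: memcmp(&X, &Y, 8) == 0.
bool eqPair(const Pair &X, const Pair &Y) {
  return X.a == Y.a && X.b == Y.b;
}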
- static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, - BasicBlock *const NextBBInChain, PHINode &Phi, - const TargetLibraryInfo *const TLI); + void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, + BasicBlock *const NextBBInChain, PHINode &Phi, + const TargetLibraryInfo *const TLI); PHINode &Phi_; std::vector<BCECmpBlock> Comparisons_; @@ -275,24 +358,47 @@ class BCECmpChain { BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi) : Phi_(Phi) { + assert(!Blocks.empty() && "a chain should have at least one block"); // Now look inside blocks to check for BCE comparisons. std::vector<BCECmpBlock> Comparisons; - for (BasicBlock *Block : Blocks) { + for (size_t BlockIdx = 0; BlockIdx < Blocks.size(); ++BlockIdx) { + BasicBlock *const Block = Blocks[BlockIdx]; + assert(Block && "invalid block"); BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block), Block, Phi.getParent()); Comparison.BB = Block; if (!Comparison.IsValid()) { - DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n"); + LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n"); return; } if (Comparison.doesOtherWork()) { - DEBUG(dbgs() << "block does extra work besides compare\n"); - if (Comparisons.empty()) { // First block. - // TODO(courbet): The first block can do other things, and we should - // split them apart in a separate block before the comparison chain. - // Right now we just discard it and make the chain shorter. - DEBUG(dbgs() - << "ignoring first block that does extra work besides compare\n"); + LLVM_DEBUG(dbgs() << "block '" << Comparison.BB->getName() + << "' does extra work besides compare\n"); + if (Comparisons.empty()) { + // This is the initial block in the chain. In case this block does other + // work, we can try to split it and move the irrelevant + // instructions to the predecessor. + // + // If this is not the initial block in the chain, splitting it won't + // work. + // + // Once split, there would still be instructions before the BCE cmp + // instructions that do other work in program order, i.e. within the + // chain before sorting, unless we could abort the chain at this point + // and start anew. + // + // NOTE: we only handle blocks with a single predecessor for now. + if (Comparison.canSplit()) { + LLVM_DEBUG(dbgs() + << "Split initial block '" << Comparison.BB->getName() + << "' that does extra work besides compare\n"); + Comparison.RequireSplit = true; + enqueueBlock(Comparisons, Comparison); + } else { + LLVM_DEBUG(dbgs() + << "ignoring initial block '" << Comparison.BB->getName() + << "' that does extra work besides compare\n"); + } continue; } // TODO(courbet): Right now we abort the whole chain. We could be @@ -320,13 +426,13 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi) // We could still merge bb1 and bb2 though. return; } - DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits() - << " bits between " << Comparison.Lhs().Base() << " + " - << Comparison.Lhs().Offset << " and " - << Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset - << "\n"); - DEBUG(dbgs() << "\n"); - Comparisons.push_back(Comparison); + enqueueBlock(Comparisons, Comparison); + } + + // It is possible we have no suitable comparison to merge.
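A source-level shape that the new splitting path plausibly targets: the first comparison block also computes a side-effect-free value that is consumed only after the chain. Because that computation writes no memory and does not use the compare's loads, canSplit() can hold and split() may hoist it into a fresh predecessor block, leaving a pure chain. A sketch under those assumptions (names illustrative):

#include <cstdint>

struct Pair {
  int32_t a;
  int32_t b;
};

int32_t classify(const Pair &X, const Pair &Y, int32_t V) {
  int32_t T = V * 2 + 1;          // extra work: pure, independent of the loads
  if (X.a == Y.a && X.b == Y.b)   // the BCE chain itself
    return 0;
  return T;                       // T is only consumed after the chain
}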
+ if (Comparisons.empty()) { + LLVM_DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n"); + return; } EntryBlock_ = Comparisons[0].BB; Comparisons_ = std::move(Comparisons); @@ -336,10 +442,10 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi) #endif // MERGEICMPS_DOT_ON // Reorder blocks by LHS. We can do that without changing the // semantics because we are only accessing dereferencable memory. - std::sort(Comparisons_.begin(), Comparisons_.end(), - [](const BCECmpBlock &a, const BCECmpBlock &b) { - return a.Lhs() < b.Lhs(); - }); + llvm::sort(Comparisons_.begin(), Comparisons_.end(), + [](const BCECmpBlock &a, const BCECmpBlock &b) { + return a.Lhs() < b.Lhs(); + }); #ifdef MERGEICMPS_DOT_ON errs() << "AFTER REORDERING:\n\n"; dump(); @@ -389,10 +495,24 @@ bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) { Phi_.removeIncomingValue(Comparison.BB, false); } + // If entry block is part of the chain, we need to make the first block + // of the chain the new entry block of the function. + BasicBlock *Entry = &Comparisons_[0].BB->getParent()->getEntryBlock(); + for (size_t I = 1; I < Comparisons_.size(); ++I) { + if (Entry == Comparisons_[I].BB) { + BasicBlock *NEntryBB = BasicBlock::Create(Entry->getContext(), "", + Entry->getParent(), Entry); + BranchInst::Create(Entry, NEntryBB); + break; + } + } + // Point the predecessors of the chain to the first comparison block (which is - // the new entry point). - if (EntryBlock_ != Comparisons_[0].BB) + // the new entry point) and update the entry block of the chain. + if (EntryBlock_ != Comparisons_[0].BB) { EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB); + EntryBlock_ = Comparisons_[0].BB; + } // Effectively merge blocks. int NumMerged = 1; @@ -424,7 +544,15 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, LLVMContext &Context = BB->getContext(); if (Comparisons.size() >= 2) { - DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n"); + // If there is one block that requires splitting, we do it now, i.e. + // just before we know we will collapse the chain. The instructions + // can be executed before any of the instructions in the chain. + auto C = std::find_if(Comparisons.begin(), Comparisons.end(), + [](const BCECmpBlock &B) { return B.RequireSplit; }); + if (C != Comparisons.end()) + C->split(EntryBlock_); + + LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n"); const auto TotalSize = std::accumulate(Comparisons.begin(), Comparisons.end(), 0, [](int Size, const BCECmpBlock &C) { @@ -445,7 +573,8 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, IRBuilder<> Builder(BB); const auto &DL = Phi.getModule()->getDataLayout(); Value *const MemCmpCall = emitMemCmp( - FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP, ConstantInt::get(DL.getIntPtrType(Context), TotalSize), + FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP, + ConstantInt::get(DL.getIntPtrType(Context), TotalSize), Builder, DL, TLI); Value *const MemCmpIsZero = Builder.CreateICmpEQ( MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0)); @@ -468,17 +597,17 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, } else { assert(Comparisons.size() == 1); // There are no blocks to merge, but we still need to update the branches. 
- DEBUG(dbgs() << "Only one comparison, updating branches\n"); + LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n"); if (NextBBInChain) { if (FirstComparison.BranchI->isConditional()) { - DEBUG(dbgs() << "conditional -> conditional\n"); + LLVM_DEBUG(dbgs() << "conditional -> conditional\n"); // Just update the "true" target, the "false" target should already be // the phi block. assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent()); FirstComparison.BranchI->setSuccessor(0, NextBBInChain); Phi.addIncoming(ConstantInt::getFalse(Context), BB); } else { - DEBUG(dbgs() << "unconditional -> conditional\n"); + LLVM_DEBUG(dbgs() << "unconditional -> conditional\n"); // Replace the unconditional branch by a conditional one. FirstComparison.BranchI->eraseFromParent(); IRBuilder<> Builder(BB); @@ -488,14 +617,14 @@ void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, } } else { if (FirstComparison.BranchI->isConditional()) { - DEBUG(dbgs() << "conditional -> unconditional\n"); + LLVM_DEBUG(dbgs() << "conditional -> unconditional\n"); // Replace the conditional branch by an unconditional one. FirstComparison.BranchI->eraseFromParent(); IRBuilder<> Builder(BB); Builder.CreateBr(Phi.getParent()); Phi.addIncoming(FirstComparison.CmpI, BB); } else { - DEBUG(dbgs() << "unconditional -> unconditional\n"); + LLVM_DEBUG(dbgs() << "unconditional -> unconditional\n"); Phi.addIncoming(FirstComparison.CmpI, BB); } } @@ -507,27 +636,28 @@ std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi, int NumBlocks) { // Walk up from the last block to find other blocks. std::vector<BasicBlock *> Blocks(NumBlocks); + assert(LastBlock && "invalid last block"); BasicBlock *CurBlock = LastBlock; for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) { if (CurBlock->hasAddressTaken()) { // Somebody is jumping to the block through an address, all bets are // off. - DEBUG(dbgs() << "skip: block " << BlockIndex - << " has its address taken\n"); + LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex + << " has its address taken\n"); return {}; } Blocks[BlockIndex] = CurBlock; auto *SinglePredecessor = CurBlock->getSinglePredecessor(); if (!SinglePredecessor) { // The block has two or more predecessors. - DEBUG(dbgs() << "skip: block " << BlockIndex - << " has two or more predecessors\n"); + LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex + << " has two or more predecessors\n"); return {}; } if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) { // The block does not link back to the phi. - DEBUG(dbgs() << "skip: block " << BlockIndex - << " does not link back to the phi\n"); + LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex + << " does not link back to the phi\n"); return {}; } CurBlock = SinglePredecessor; @@ -537,9 +667,9 @@ std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi, } bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) { - DEBUG(dbgs() << "processPhi()\n"); + LLVM_DEBUG(dbgs() << "processPhi()\n"); if (Phi.getNumIncomingValues() <= 1) { - DEBUG(dbgs() << "skip: only one incoming value in phi\n"); + LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n"); return false; } // We are looking for something that has the following structure: @@ -552,7 +682,7 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) { // - The last basic block (bb4 here) must branch unconditionally to bb_phi. // It's the only block that contributes a non-constant value to the Phi. 
// - All other blocks (b1, b2, b3) must have exactly two successors, one of - // them being the the phi block. + // them being the phi block. // - All intermediate blocks (bb2, bb3) must have only one predecessor. // - Blocks cannot do other work besides the comparison, see doesOtherWork() @@ -563,18 +693,31 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) { if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue; if (LastBlock) { // There are several non-constant values. - DEBUG(dbgs() << "skip: several non-constant values\n"); + LLVM_DEBUG(dbgs() << "skip: several non-constant values\n"); + return false; + } + if (!isa<ICmpInst>(Phi.getIncomingValue(I)) || + cast<ICmpInst>(Phi.getIncomingValue(I))->getParent() != + Phi.getIncomingBlock(I)) { + // The non-constant incoming value is not from a cmp instruction or is + // not produced by the last block. We could end up processing the value + // producing block more than once. + // + // This is an uncommon case, so we bail. + LLVM_DEBUG( + dbgs() + << "skip: non-constant value not from cmp or not from last block.\n"); return false; } LastBlock = Phi.getIncomingBlock(I); } if (!LastBlock) { // There is no non-constant block. - DEBUG(dbgs() << "skip: no non-constant block\n"); + LLVM_DEBUG(dbgs() << "skip: no non-constant block\n"); return false; } if (LastBlock->getSingleSuccessor() != Phi.getParent()) { - DEBUG(dbgs() << "skip: last block non-phi successor\n"); + LLVM_DEBUG(dbgs() << "skip: last block non-phi successor\n"); return false; } @@ -584,7 +727,7 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) { BCECmpChain CmpChain(Blocks, Phi); if (CmpChain.size() < 2) { - DEBUG(dbgs() << "skip: only one compare block\n"); + LLVM_DEBUG(dbgs() << "skip: only one compare block\n"); return false; } @@ -619,12 +762,16 @@ class MergeICmps : public FunctionPass { PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI) { - DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n"); + LLVM_DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n"); // We only try merging comparisons if the target wants to expand memcmp later. // The rationale is to avoid turning small chains into memcmp calls. if (!TTI->enableMemCmpExpansion(true)) return PreservedAnalyses::all(); + // If we don't have memcmp available, we can't emit calls to it. + if (!TLI->has(LibFunc_memcmp)) + return PreservedAnalyses::all(); + bool MadeChange = false; for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) { diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index f2f615cb9b0f..3464b759280f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // //! \file -//! \brief This pass performs merges of loads and stores on both sides of a +//! This pass performs merges of loads and stores on both sides of a // diamond (hammock). It hoists the loads and sinks the stores.
// // The algorithm iteratively hoists two loads to the same address out of a @@ -80,7 +80,6 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" -#include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Debug.h" @@ -97,7 +96,6 @@ namespace { // MergedLoadStoreMotion Pass //===----------------------------------------------------------------------===// class MergedLoadStoreMotion { - MemoryDependenceResults *MD = nullptr; AliasAnalysis *AA = nullptr; // The mergeLoad/Store algorithms could have Size0 * Size1 complexity, @@ -107,14 +105,9 @@ class MergedLoadStoreMotion { const int MagicCompileTimeControl = 250; public: - bool run(Function &F, MemoryDependenceResults *MD, AliasAnalysis &AA); + bool run(Function &F, AliasAnalysis &AA); private: - /// - /// \brief Remove instruction from parent and update memory dependence - /// analysis. - /// - void removeInstruction(Instruction *Inst); BasicBlock *getDiamondTail(BasicBlock *BB); bool isDiamondHead(BasicBlock *BB); // Routines for sinking stores @@ -128,23 +121,7 @@ private: } // end anonymous namespace /// -/// \brief Remove instruction from parent and update memory dependence analysis. -/// -void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) { - // Notify the memory dependence analysis. - if (MD) { - MD->removeInstruction(Inst); - if (auto *LI = dyn_cast<LoadInst>(Inst)) - MD->invalidateCachedPointerInfo(LI->getPointerOperand()); - if (Inst->getType()->isPtrOrPtrVectorTy()) { - MD->invalidateCachedPointerInfo(Inst); - } - } - Inst->eraseFromParent(); -} - -/// -/// \brief Return tail block of a diamond. +/// Return tail block of a diamond. /// BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) { assert(isDiamondHead(BB) && "Basic block is not head of a diamond"); @@ -152,7 +129,7 @@ BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) { } /// -/// \brief True when BB is the head of a diamond (hammock) +/// True when BB is the head of a diamond (hammock) /// bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { if (!BB) @@ -179,7 +156,7 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { /// -/// \brief True when instruction is a sink barrier for a store +/// True when instruction is a sink barrier for a store /// located in Loc /// /// Whenever an instruction could possibly read or modify the @@ -197,13 +174,13 @@ bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start, } /// -/// \brief Check if \p BB contains a store to the same address as \p SI +/// Check if \p BB contains a store to the same address as \p SI /// /// \return The store in \p when it is safe to sink. Otherwise return Null. /// StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, StoreInst *Store0) { - DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "can Sink? 
: "; Store0->dump(); dbgs() << "\n"); BasicBlock *BB0 = Store0->getParent(); for (Instruction &Inst : reverse(*BB1)) { auto *Store1 = dyn_cast<StoreInst>(&Inst); @@ -222,7 +199,7 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, } /// -/// \brief Create a PHI node in BB for the operands of S0 and S1 +/// Create a PHI node in BB for the operands of S0 and S1 /// PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1) { @@ -236,13 +213,11 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, &BB->front()); NewPN->addIncoming(Opd1, S0->getParent()); NewPN->addIncoming(Opd2, S1->getParent()); - if (MD && NewPN->getType()->isPtrOrPtrVectorTy()) - MD->invalidateCachedPointerInfo(NewPN); return NewPN; } /// -/// \brief Merge two stores to same address and sink into \p BB +/// Merge two stores to same address and sink into \p BB /// /// Also sinks GEP instruction computing the store address /// @@ -254,9 +229,9 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() && (A0->getParent() == S0->getParent()) && A1->hasOneUse() && (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) { - DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump(); - dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n"; - dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump(); + dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n"; + dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n"); // Hoist the instruction. BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); // Intersect optional metadata. @@ -275,19 +250,19 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, // New PHI operand? Use it. if (PHINode *NewPN = getPHIOperand(BB, S0, S1)) SNew->setOperand(0, NewPN); - removeInstruction(S0); - removeInstruction(S1); + S0->eraseFromParent(); + S1->eraseFromParent(); A0->replaceAllUsesWith(ANew); - removeInstruction(A0); + A0->eraseFromParent(); A1->replaceAllUsesWith(ANew); - removeInstruction(A1); + A1->eraseFromParent(); return true; } return false; } /// -/// \brief True when two stores are equivalent and can sink into the footer +/// True when two stores are equivalent and can sink into the footer /// /// Starting from a diamond tail block, iterate over the instructions in one /// predecessor block and try to match a store in the second predecessor. @@ -310,7 +285,8 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { return false; // No. More than 2 predecessors. 
// #Instructions in Succ1 for Compile Time Control - int Size1 = Pred1->size(); + auto InstsNoDbg = Pred1->instructionsWithoutDebug(); + int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end()); int NStores = 0; for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend(); @@ -338,19 +314,17 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { break; RBI = Pred0->rbegin(); RBE = Pred0->rend(); - DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump()); + LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump()); } } return MergedStores; } -bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD, - AliasAnalysis &AA) { - this->MD = MD; +bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) { this->AA = &AA; bool Changed = false; - DEBUG(dbgs() << "Instruction Merger\n"); + LLVM_DEBUG(dbgs() << "Instruction Merger\n"); // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. @@ -376,15 +350,13 @@ public: } /// - /// \brief Run the transformation for each function + /// Run the transformation for each function /// bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; MergedLoadStoreMotion Impl; - auto *MDWP = getAnalysisIfAvailable<MemoryDependenceWrapperPass>(); - return Impl.run(F, MDWP ? &MDWP->getMemDep() : nullptr, - getAnalysis<AAResultsWrapperPass>().getAAResults()); + return Impl.run(F, getAnalysis<AAResultsWrapperPass>().getAAResults()); } private: @@ -392,7 +364,6 @@ private: AU.setPreservesCFG(); AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<MemoryDependenceWrapperPass>(); } }; @@ -400,7 +371,7 @@ char MergedLoadStoreMotionLegacyPass::ID = 0; } // anonymous namespace /// -/// \brief createMergedLoadStoreMotionPass - The public interface to this file. +/// createMergedLoadStoreMotionPass - The public interface to this file. 
/// FunctionPass *llvm::createMergedLoadStoreMotionPass() { return new MergedLoadStoreMotionLegacyPass(); @@ -408,7 +379,6 @@ FunctionPass *llvm::createMergedLoadStoreMotionPass() { INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion", "MergedLoadStoreMotion", false, false) -INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion", "MergedLoadStoreMotion", false, false) @@ -416,14 +386,12 @@ INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion", PreservedAnalyses MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) { MergedLoadStoreMotion Impl; - auto *MD = AM.getCachedResult<MemoryDependenceAnalysis>(F); auto &AA = AM.getResult<AAManager>(F); - if (!Impl.run(F, MD, AA)) + if (!Impl.run(F, AA)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); - PA.preserve<MemoryDependenceAnalysis>(); return PA; } diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index b026c8d692c3..7106ea216ad6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -83,6 +83,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -105,7 +106,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> @@ -240,10 +240,17 @@ bool NaryReassociatePass::doOneIteration(Function &F) { Changed = true; SE->forgetValue(&*I); I->replaceAllUsesWith(NewI); - // If SeenExprs constains I's WeakTrackingVH, that entry will be - // replaced with - // nullptr. + WeakVH NewIExist = NewI; + // If SeenExprs/NewIExist contains I's WeakTrackingVH/WeakVH, that + // entry will be replaced with nullptr if deleted. RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI); + if (!NewIExist) { + // Rare occasion where the new instruction (NewI) has been removed, + // probably because parts of the input code were dead from the + // beginning. Reset the iterator and start over from the beginning. + I = BB->begin(); + continue; + } I = NewI->getIterator(); } // Add the rewritten instruction to SeenExprs; the original instruction @@ -429,6 +436,9 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, Instruction *NaryReassociatePass::tryReassociateBinaryOp(BinaryOperator *I) { Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); + // There is no need to reassociate 0.
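The guard added in doOneIteration above relies on WeakVH nulling itself when the instruction it watches is deleted. A standalone demonstration of that behavior (assuming LLVM dev headers; the dead add is illustrative):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  Function *F = Function::Create(
      FunctionType::get(Type::getVoidTy(Ctx), {Type::getInt32Ty(Ctx)}, false),
      Function::ExternalLinkage, "f", &M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  Value *Arg = &*F->arg_begin();
  Instruction *Dead = cast<Instruction>(B.CreateAdd(Arg, Arg, "dead"));
  B.CreateRetVoid();
  WeakVH Handle(Dead);
  Dead->eraseFromParent(); // stands in for RecursivelyDeleteTriviallyDeadInstructions
  outs() << (Handle ? "still alive" : "handle nulled") << '\n'; // handle nulled
  return 0;
}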
+ if (SE->getSCEV(I)->isZero()) + return nullptr; if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I)) return NewI; if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I)) diff --git a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp index 9ebf2d769356..2eb887c986be 100644 --- a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -77,6 +77,7 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -105,7 +106,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVNExpression.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PredicateInfo.h" #include "llvm/Transforms/Utils/VNCoercion.h" #include <algorithm> @@ -221,13 +221,13 @@ private: Components.resize(Components.size() + 1); auto &Component = Components.back(); Component.insert(I); - DEBUG(dbgs() << "Component root is " << *I << "\n"); + LLVM_DEBUG(dbgs() << "Component root is " << *I << "\n"); InComponent.insert(I); ValueToComponent[I] = ComponentID; // Pop a component off the stack and label it. while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) { auto *Member = Stack.back(); - DEBUG(dbgs() << "Component member is " << *Member << "\n"); + LLVM_DEBUG(dbgs() << "Component member is " << *Member << "\n"); Component.insert(Member); InComponent.insert(Member); ValueToComponent[Member] = ComponentID; @@ -366,9 +366,8 @@ public: // True if this class has no memory members. bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); } - // Return true if two congruence classes are equivalent to each other. This - // means - // that every field but the ID number and the dead field are equivalent. + // Return true if two congruence classes are equivalent to each other. This + // means that every field but the ID number and the dead field are equivalent. bool isEquivalentTo(const CongruenceClass *Other) const { if (!Other) return false; @@ -383,10 +382,12 @@ public: if (!DefiningExpr || !Other->DefiningExpr || *DefiningExpr != *Other->DefiningExpr) return false; - // We need some ordered set - std::set<Value *> AMembers(Members.begin(), Members.end()); - std::set<Value *> BMembers(Members.begin(), Members.end()); - return AMembers == BMembers; + + if (Members.size() != Other->Members.size()) + return false; + + return all_of(Members, + [&](const Value *V) { return Other->Members.count(V); }); } private: @@ -860,7 +861,7 @@ private: // Debug counter info. When verifying, we have to reset the value numbering // debug counter to the same state it started in to get the same results. - std::pair<int, int> StartingVNCounter; + int64_t StartingVNCounter; }; } // end anonymous namespace @@ -958,7 +959,8 @@ static bool isCopyOfAPHI(const Value *V) { // order. The BlockInstRange numbers are generated in an RPO walk of the basic // blocks. 
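The isEquivalentTo rewrite above also fixes a latent bug visible in the removed lines: both AMembers and BMembers were built from this class's own Members, so the old comparison never looked at Other at all. The replacement is the standard hash-set equality check; a plain-C++ sketch of the same shape:

#include <unordered_set>

// Two sets are equal iff the sizes match and every element of A occurs in B.
// Expected O(n), with no ordered std::set copies.
bool sameMembers(const std::unordered_set<const void *> &A,
                 const std::unordered_set<const void *> &B) {
  if (A.size() != B.size())
    return false;
  for (const void *V : A)
    if (!B.count(V))
      return false;
  return true;
}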
void NewGVN::sortPHIOps(MutableArrayRef<ValPair> Ops) const { - std::sort(Ops.begin(), Ops.end(), [&](const ValPair &P1, const ValPair &P2) { + llvm::sort(Ops.begin(), Ops.end(), + [&](const ValPair &P1, const ValPair &P2) { return BlockInstRange.lookup(P1.second).first < BlockInstRange.lookup(P2.second).first; }); @@ -1067,8 +1069,8 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E, return nullptr; if (auto *C = dyn_cast<Constant>(V)) { if (I) - DEBUG(dbgs() << "Simplified " << *I << " to " - << " constant " << *C << "\n"); + LLVM_DEBUG(dbgs() << "Simplified " << *I << " to " + << " constant " << *C << "\n"); NumGVNOpsSimplified++; assert(isa<BasicExpression>(E) && "We should always have had a basic expression here"); @@ -1076,8 +1078,8 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E, return createConstantExpression(C); } else if (isa<Argument>(V) || isa<GlobalVariable>(V)) { if (I) - DEBUG(dbgs() << "Simplified " << *I << " to " - << " variable " << *V << "\n"); + LLVM_DEBUG(dbgs() << "Simplified " << *I << " to " + << " variable " << *V << "\n"); deleteExpression(E); return createVariableExpression(V); } @@ -1100,8 +1102,8 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E, } if (I) - DEBUG(dbgs() << "Simplified " << *I << " to " - << " expression " << *CC->getDefiningExpr() << "\n"); + LLVM_DEBUG(dbgs() << "Simplified " << *I << " to " + << " expression " << *CC->getDefiningExpr() << "\n"); NumGVNOpsSimplified++; deleteExpression(E); return CC->getDefiningExpr(); @@ -1257,7 +1259,7 @@ bool NewGVN::someEquivalentDominates(const Instruction *Inst, // This must be an instruction because we are only called from phi nodes // in the case that the value it needs to check against is an instruction. - // The most likely candiates for dominance are the leader and the next leader. + // The most likely candidates for dominance are the leader and the next leader. // The leader or nextleader will dominate in all cases where there is an // equivalent that is higher up in the dom tree. 
// We can't *only* check them, however, because the @@ -1421,8 +1423,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, if (Offset >= 0) { if (auto *C = dyn_cast<Constant>( lookupOperandLeader(DepSI->getValueOperand()))) { - DEBUG(dbgs() << "Coercing load from store " << *DepSI << " to constant " - << *C << "\n"); + LLVM_DEBUG(dbgs() << "Coercing load from store " << *DepSI + << " to constant " << *C << "\n"); return createConstantExpression( getConstantStoreValueForLoad(C, Offset, LoadType, DL)); } @@ -1437,8 +1439,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI))) if (auto *PossibleConstant = getConstantLoadValueForLoad(C, Offset, LoadType, DL)) { - DEBUG(dbgs() << "Coercing load from load " << *LI << " to constant " - << *PossibleConstant << "\n"); + LLVM_DEBUG(dbgs() << "Coercing load from load " << *LI + << " to constant " << *PossibleConstant << "\n"); return createConstantExpression(PossibleConstant); } } @@ -1447,8 +1449,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, if (Offset >= 0) { if (auto *PossibleConstant = getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) { - DEBUG(dbgs() << "Coercing load from meminst " << *DepMI - << " to constant " << *PossibleConstant << "\n"); + LLVM_DEBUG(dbgs() << "Coercing load from meminst " << *DepMI + << " to constant " << *PossibleConstant << "\n"); return createConstantExpression(PossibleConstant); } } @@ -1529,7 +1531,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { if (!PI) return nullptr; - DEBUG(dbgs() << "Found predicate info from instruction !\n"); + LLVM_DEBUG(dbgs() << "Found predicate info from instruction !\n"); auto *PWC = dyn_cast<PredicateWithCondition>(PI); if (!PWC) @@ -1569,7 +1571,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { return nullptr; if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) { - DEBUG(dbgs() << "Copy is not of any condition operands!\n"); + LLVM_DEBUG(dbgs() << "Copy is not of any condition operands!\n"); return nullptr; } Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0)); @@ -1584,11 +1586,11 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate(); if (isa<PredicateAssume>(PI)) { - // If the comparison is true when the operands are equal, then we know the - // operands are equal, because assumes must always be true. - if (CmpInst::isTrueWhenEqual(Predicate)) { + // If we assume the operands are equal, then they are equal. + if (Predicate == CmpInst::ICMP_EQ) { addPredicateUsers(PI, I); - addAdditionalUsers(Cmp->getOperand(0), I); + addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0), + I); return createVariableOrConstant(FirstOp); } } @@ -1622,7 +1624,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const { auto *CI = cast<CallInst>(I); if (auto *II = dyn_cast<IntrinsicInst>(I)) { - // Instrinsics with the returned attribute are copies of arguments. + // Intrinsics with the returned attribute are copies of arguments. 
if (auto *ReturnedValue = II->getReturnedArgOperand()) { if (II->getIntrinsicID() == Intrinsic::ssa_copy) if (const auto *Result = performSymbolicPredicateInfoEvaluation(I)) @@ -1652,10 +1654,11 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From, CongruenceClass *NewClass) { assert(NewClass && "Every MemoryAccess should be getting mapped to a non-null class"); - DEBUG(dbgs() << "Setting " << *From); - DEBUG(dbgs() << " equivalent to congruence class "); - DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader "); - DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n"); + LLVM_DEBUG(dbgs() << "Setting " << *From); + LLVM_DEBUG(dbgs() << " equivalent to congruence class "); + LLVM_DEBUG(dbgs() << NewClass->getID() + << " with current MemoryAccess leader "); + LLVM_DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n"); auto LookupResult = MemoryAccessToClass.find(From); bool Changed = false; @@ -1673,11 +1676,11 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From, OldClass->setMemoryLeader(nullptr); } else { OldClass->setMemoryLeader(getNextMemoryLeader(OldClass)); - DEBUG(dbgs() << "Memory class leader change for class " - << OldClass->getID() << " to " - << *OldClass->getMemoryLeader() - << " due to removal of a memory member " << *From - << "\n"); + LLVM_DEBUG(dbgs() << "Memory class leader change for class " + << OldClass->getID() << " to " + << *OldClass->getMemoryLeader() + << " due to removal of a memory member " << *From + << "\n"); markMemoryLeaderChangeTouched(OldClass); } } @@ -1705,7 +1708,7 @@ bool NewGVN::isCycleFree(const Instruction *I) const { if (ICS == ICS_Unknown) { SCCFinder.Start(I); auto &SCC = SCCFinder.getComponentFor(I); - // It's cycle free if it's size 1 or or the SCC is *only* phi nodes. + // It's cycle free if it's size 1 or the SCC is *only* phi nodes. if (SCC.size() == 1) InstCycleState.insert({I, ICS_CycleFree}); else { @@ -1753,12 +1756,13 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps, // If it has undef at this point, it means there are no-non-undef arguments, // and thus, the value of the phi node must be undef. 
if (HasUndef) { - DEBUG(dbgs() << "PHI Node " << *I - << " has no non-undef arguments, valuing it as undef\n"); + LLVM_DEBUG( + dbgs() << "PHI Node " << *I + << " has no non-undef arguments, valuing it as undef\n"); return createConstantExpression(UndefValue::get(I->getType())); } - DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n"); + LLVM_DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n"); deleteExpression(E); return createDeadExpression(); } @@ -1797,8 +1801,8 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps, InstrToDFSNum(AllSameValue) > InstrToDFSNum(I)) return E; NumGVNPhisAllSame++; - DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue - << "\n"); + LLVM_DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue + << "\n"); deleteExpression(E); return createVariableOrConstant(AllSameValue); } @@ -2091,7 +2095,7 @@ void NewGVN::markUsersTouched(Value *V) { } void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const { - DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n"); + LLVM_DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n"); MemoryToUsers[To].insert(U); } @@ -2207,13 +2211,13 @@ Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const { // // - I must be moving to NewClass from OldClass // - The StoreCount of OldClass and NewClass is expected to have been updated -// for I already if it is is a store. +// for I already if it is a store. // - The OldClass memory leader has not been updated yet if I was the leader. void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I, MemoryAccess *InstMA, CongruenceClass *OldClass, CongruenceClass *NewClass) { - // If the leader is I, and we had a represenative MemoryAccess, it should + // If the leader is I, and we had a representative MemoryAccess, it should // be the MemoryAccess of OldClass. 
assert((!InstMA || !OldClass->getMemoryLeader() || OldClass->getLeader() != I || @@ -2227,8 +2231,9 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I, (isa<StoreInst>(I) && NewClass->getStoreCount() == 1)); NewClass->setMemoryLeader(InstMA); // Mark it touched if we didn't just create a singleton - DEBUG(dbgs() << "Memory class leader change for class " << NewClass->getID() - << " due to new memory instruction becoming leader\n"); + LLVM_DEBUG(dbgs() << "Memory class leader change for class " + << NewClass->getID() + << " due to new memory instruction becoming leader\n"); markMemoryLeaderChangeTouched(NewClass); } setMemoryClass(InstMA, NewClass); @@ -2236,10 +2241,10 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I, if (OldClass->getMemoryLeader() == InstMA) { if (!OldClass->definesNoMemory()) { OldClass->setMemoryLeader(getNextMemoryLeader(OldClass)); - DEBUG(dbgs() << "Memory class leader change for class " - << OldClass->getID() << " to " - << *OldClass->getMemoryLeader() - << " due to removal of old leader " << *InstMA << "\n"); + LLVM_DEBUG(dbgs() << "Memory class leader change for class " + << OldClass->getID() << " to " + << *OldClass->getMemoryLeader() + << " due to removal of old leader " << *InstMA << "\n"); markMemoryLeaderChangeTouched(OldClass); } else OldClass->setMemoryLeader(nullptr); @@ -2276,9 +2281,10 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E, NewClass->setStoredValue(SE->getStoredValue()); markValueLeaderChangeTouched(NewClass); // Shift the new class leader to be the store - DEBUG(dbgs() << "Changing leader of congruence class " - << NewClass->getID() << " from " << *NewClass->getLeader() - << " to " << *SI << " because store joined class\n"); + LLVM_DEBUG(dbgs() << "Changing leader of congruence class " + << NewClass->getID() << " from " + << *NewClass->getLeader() << " to " << *SI + << " because store joined class\n"); // If we changed the leader, we have to mark it changed because we don't // know what it will do to symbolic evaluation. NewClass->setLeader(SI); @@ -2298,8 +2304,8 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E, // See if we destroyed the class or need to swap leaders. if (OldClass->empty() && OldClass != TOPClass) { if (OldClass->getDefiningExpr()) { - DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr() - << " from table\n"); + LLVM_DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr() + << " from table\n"); // We erase it as an exact expression to make sure we don't just erase an // equivalent one. auto Iter = ExpressionToClass.find_as( @@ -2316,8 +2322,8 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E, // When the leader changes, the value numbering of // everything may change due to symbolization changes, so we need to // reprocess. - DEBUG(dbgs() << "Value class leader change for class " << OldClass->getID() - << "\n"); + LLVM_DEBUG(dbgs() << "Value class leader change for class " + << OldClass->getID() << "\n"); ++NumGVNLeaderChanges; // Destroy the stored value if there are no more stores to represent it. 
// Note that this is basically clean up for the expression removal that @@ -2380,12 +2386,14 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) { "VariableExpression should have been handled already"); EClass = NewClass; - DEBUG(dbgs() << "Created new congruence class for " << *I - << " using expression " << *E << " at " << NewClass->getID() - << " and leader " << *(NewClass->getLeader())); + LLVM_DEBUG(dbgs() << "Created new congruence class for " << *I + << " using expression " << *E << " at " + << NewClass->getID() << " and leader " + << *(NewClass->getLeader())); if (NewClass->getStoredValue()) - DEBUG(dbgs() << " and stored value " << *(NewClass->getStoredValue())); - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << " and stored value " + << *(NewClass->getStoredValue())); + LLVM_DEBUG(dbgs() << "\n"); } else { EClass = lookupResult.first->second; if (isa<ConstantExpression>(E)) @@ -2403,8 +2411,8 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) { bool ClassChanged = IClass != EClass; bool LeaderChanged = LeaderChanges.erase(I); if (ClassChanged || LeaderChanged) { - DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E - << "\n"); + LLVM_DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " + << *E << "\n"); if (ClassChanged) { moveValueToNewCongruenceClass(I, E, IClass, EClass); markPhiOfOpsChanged(E); @@ -2442,13 +2450,15 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) { if (ReachableEdges.insert({From, To}).second) { // If this block wasn't reachable before, all instructions are touched. if (ReachableBlocks.insert(To).second) { - DEBUG(dbgs() << "Block " << getBlockName(To) << " marked reachable\n"); + LLVM_DEBUG(dbgs() << "Block " << getBlockName(To) + << " marked reachable\n"); const auto &InstRange = BlockInstRange.lookup(To); TouchedInstructions.set(InstRange.first, InstRange.second); } else { - DEBUG(dbgs() << "Block " << getBlockName(To) - << " was reachable, but new edge {" << getBlockName(From) - << "," << getBlockName(To) << "} to it found\n"); + LLVM_DEBUG(dbgs() << "Block " << getBlockName(To) + << " was reachable, but new edge {" + << getBlockName(From) << "," << getBlockName(To) + << "} to it found\n"); // We've made an edge reachable to an existing block, which may // impact predicates. 
Otherwise, only mark the phi nodes as touched, as @@ -2495,12 +2505,12 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) { BasicBlock *FalseSucc = BR->getSuccessor(1); if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) { if (CI->isOne()) { - DEBUG(dbgs() << "Condition for Terminator " << *TI - << " evaluated to true\n"); + LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI + << " evaluated to true\n"); updateReachableEdge(B, TrueSucc); } else if (CI->isZero()) { - DEBUG(dbgs() << "Condition for Terminator " << *TI - << " evaluated to false\n"); + LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI + << " evaluated to false\n"); updateReachableEdge(B, FalseSucc); } } else { @@ -2685,8 +2695,8 @@ Value *NewGVN::findLeaderForInst(Instruction *TransInst, auto *FoundVal = findPHIOfOpsLeader(E, OrigInst, PredBB); if (!FoundVal) { ExpressionToPhiOfOps[E].insert(OrigInst); - DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst - << " in block " << getBlockName(PredBB) << "\n"); + LLVM_DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst + << " in block " << getBlockName(PredBB) << "\n"); return nullptr; } if (auto *SI = dyn_cast<StoreInst>(FoundVal)) @@ -2723,116 +2733,143 @@ NewGVN::makePossiblePHIOfOps(Instruction *I, MemAccess->getDefiningAccess()->getBlock() == I->getParent()) return nullptr; - SmallPtrSet<const Value *, 10> VisitedOps; // Convert op of phis to phi of ops - for (auto *Op : I->operand_values()) { + SmallPtrSet<const Value *, 10> VisitedOps; + SmallVector<Value *, 4> Ops(I->operand_values()); + BasicBlock *SamePHIBlock = nullptr; + PHINode *OpPHI = nullptr; + if (!DebugCounter::shouldExecute(PHIOfOpsCounter)) + return nullptr; + for (auto *Op : Ops) { if (!isa<PHINode>(Op)) { auto *ValuePHI = RealToTemp.lookup(Op); if (!ValuePHI) continue; - DEBUG(dbgs() << "Found possible dependent phi of ops\n"); + LLVM_DEBUG(dbgs() << "Found possible dependent phi of ops\n"); Op = ValuePHI; } - auto *OpPHI = cast<PHINode>(Op); + OpPHI = cast<PHINode>(Op); + if (!SamePHIBlock) { + SamePHIBlock = getBlockForValue(OpPHI); + } else if (SamePHIBlock != getBlockForValue(OpPHI)) { + LLVM_DEBUG( + dbgs() + << "PHIs for operands are not all in the same block, aborting\n"); + return nullptr; + } // No point in doing this for one-operand phis. - if (OpPHI->getNumOperands() == 1) + if (OpPHI->getNumOperands() == 1) { + OpPHI = nullptr; continue; - if (!DebugCounter::shouldExecute(PHIOfOpsCounter)) - return nullptr; - SmallVector<ValPair, 4> Ops; - SmallPtrSet<Value *, 4> Deps; - auto *PHIBlock = getBlockForValue(OpPHI); - RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I)); - for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) { - auto *PredBB = OpPHI->getIncomingBlock(PredNum); - Value *FoundVal = nullptr; - // We could just skip unreachable edges entirely but it's tricky to do - // with rewriting existing phi nodes. - if (ReachableEdges.count({PredBB, PHIBlock})) { - // Clone the instruction, create an expression from it that is - // translated back into the predecessor, and see if we have a leader. - Instruction *ValueOp = I->clone(); - if (MemAccess) - TempToMemory.insert({ValueOp, MemAccess}); - bool SafeForPHIOfOps = true; - VisitedOps.clear(); - for (auto &Op : ValueOp->operands()) { - auto *OrigOp = &*Op; - // When these operand changes, it could change whether there is a - // leader for us or not, so we have to add additional users. 
- if (isa<PHINode>(Op)) { - Op = Op->DoPHITranslation(PHIBlock, PredBB); - if (Op != OrigOp && Op != I) - Deps.insert(Op); - } else if (auto *ValuePHI = RealToTemp.lookup(Op)) { - if (getBlockForValue(ValuePHI) == PHIBlock) - Op = ValuePHI->getIncomingValueForBlock(PredBB); - } - // If we phi-translated the op, it must be safe. - SafeForPHIOfOps = - SafeForPHIOfOps && - (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps)); + } + } + + if (!OpPHI) + return nullptr; + + SmallVector<ValPair, 4> PHIOps; + SmallPtrSet<Value *, 4> Deps; + auto *PHIBlock = getBlockForValue(OpPHI); + RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I)); + for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) { + auto *PredBB = OpPHI->getIncomingBlock(PredNum); + Value *FoundVal = nullptr; + SmallPtrSet<Value *, 4> CurrentDeps; + // We could just skip unreachable edges entirely but it's tricky to do + // with rewriting existing phi nodes. + if (ReachableEdges.count({PredBB, PHIBlock})) { + // Clone the instruction, create an expression from it that is + // translated back into the predecessor, and see if we have a leader. + Instruction *ValueOp = I->clone(); + if (MemAccess) + TempToMemory.insert({ValueOp, MemAccess}); + bool SafeForPHIOfOps = true; + VisitedOps.clear(); + for (auto &Op : ValueOp->operands()) { + auto *OrigOp = &*Op; + // When these operands change, it could change whether there is a + // leader for us or not, so we have to add additional users. + if (isa<PHINode>(Op)) { + Op = Op->DoPHITranslation(PHIBlock, PredBB); + if (Op != OrigOp && Op != I) + CurrentDeps.insert(Op); + } else if (auto *ValuePHI = RealToTemp.lookup(Op)) { + if (getBlockForValue(ValuePHI) == PHIBlock) + Op = ValuePHI->getIncomingValueForBlock(PredBB); } + // If we phi-translated the op, it must be safe.
+ SafeForPHIOfOps = + SafeForPHIOfOps && + (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps)); } - - Ops.push_back({FoundVal, PredBB}); - DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in " - << getBlockName(PredBB) << "\n"); - } - for (auto Dep : Deps) - addAdditionalUsers(Dep, I); - sortPHIOps(Ops); - auto *E = performSymbolicPHIEvaluation(Ops, I, PHIBlock); - if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) { - DEBUG(dbgs() - << "Not creating real PHI of ops because it simplified to existing " - "value or constant\n"); - return E; - } - auto *ValuePHI = RealToTemp.lookup(I); - bool NewPHI = false; - if (!ValuePHI) { - ValuePHI = - PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops"); - addPhiOfOps(ValuePHI, PHIBlock, I); - NewPHI = true; - NumGVNPHIOfOpsCreated++; - } - if (NewPHI) { - for (auto PHIOp : Ops) - ValuePHI->addIncoming(PHIOp.first, PHIOp.second); - } else { - unsigned int i = 0; - for (auto PHIOp : Ops) { - ValuePHI->setIncomingValue(i, PHIOp.first); - ValuePHI->setIncomingBlock(i, PHIOp.second); - ++i; + // FIXME: For those things that are not safe we could generate + // expressions all the way down, and see if this comes out to a + // constant. For anything where that is true, and unsafe, we should + // have made a phi-of-ops (or value numbered it equivalent to something) + // for the pieces already. + FoundVal = !SafeForPHIOfOps ? nullptr + : findLeaderForInst(ValueOp, Visited, + MemAccess, I, PredBB); + ValueOp->deleteValue(); + if (!FoundVal) { + // We failed to find a leader for the current ValueOp, but this might + // change if the translated operands change. + if (SafeForPHIOfOps) + for (auto Dep : CurrentDeps) + addAdditionalUsers(Dep, I); + + return nullptr; } + Deps.insert(CurrentDeps.begin(), CurrentDeps.end()); + } else { + LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block " + << getBlockName(PredBB) + << " because the block is unreachable\n"); + FoundVal = UndefValue::get(I->getType()); + RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I)); } - RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I)); - DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I - << "\n"); + PHIOps.push_back({FoundVal, PredBB}); + LLVM_DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in " + << getBlockName(PredBB) << "\n"); + } + for (auto Dep : Deps) + addAdditionalUsers(Dep, I); + sortPHIOps(PHIOps); + auto *E = performSymbolicPHIEvaluation(PHIOps, I, PHIBlock); + if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) { + LLVM_DEBUG( + dbgs() + << "Not creating real PHI of ops because it simplified to existing " + "value or constant\n"); return E; } - return nullptr; + auto *ValuePHI = RealToTemp.lookup(I); + bool NewPHI = false; + if (!ValuePHI) { + ValuePHI = + PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops"); + addPhiOfOps(ValuePHI, PHIBlock, I); + NewPHI = true; + NumGVNPHIOfOpsCreated++; + } + if (NewPHI) { + for (auto PHIOp : PHIOps) + ValuePHI->addIncoming(PHIOp.first, PHIOp.second); + } else { + TempToBlock[ValuePHI] = PHIBlock; + unsigned int i = 0; + for (auto PHIOp : PHIOps) { + ValuePHI->setIncomingValue(i, PHIOp.first); + ValuePHI->setIncomingBlock(i, PHIOp.second); + ++i; + } + } + RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I)); + LLVM_DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I + << "\n"); + + return E; } // The algorithm initially places the values of the routine in the TOP @@ -2902,8
+2939,9 @@ void NewGVN::initializeCongruenceClasses(Function &F) { void NewGVN::cleanupTables() { for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) { - DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID() - << " has " << CongruenceClasses[i]->size() << " members\n"); + LLVM_DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID() + << " has " << CongruenceClasses[i]->size() + << " members\n"); // Make sure we delete the congruence class (probably worth switching to // a unique_ptr at some point). delete CongruenceClasses[i]; @@ -2973,7 +3011,7 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B, // we change its DFS number so that it doesn't get value numbered. if (isInstructionTriviallyDead(&I, TLI)) { InstrDFS[&I] = 0; - DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n"); + LLVM_DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n"); markInstructionForDeletion(&I); continue; } @@ -3039,9 +3077,10 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) { [&AllSameValue](const MemoryAccess *V) { return V == AllSameValue; }); if (AllEqual) - DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n"); + LLVM_DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue + << "\n"); else - DEBUG(dbgs() << "Memory Phi value numbered to itself\n"); + LLVM_DEBUG(dbgs() << "Memory Phi value numbered to itself\n"); // If it's equal to something, it's in that class. Otherwise, it has to be in // a class where it is the leader (other things may be equivalent to it, but // it needs to start off in its own class, which means it must have been the @@ -3060,7 +3099,7 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) { // Value number a single instruction, symbolically evaluating, performing // congruence finding, and updating mappings. void NewGVN::valueNumberInstruction(Instruction *I) { - DEBUG(dbgs() << "Processing instruction " << *I << "\n"); + LLVM_DEBUG(dbgs() << "Processing instruction " << *I << "\n"); if (!I->isTerminator()) { const Expression *Symbolized = nullptr; SmallPtrSet<Value *, 2> Visited; @@ -3246,7 +3285,7 @@ void NewGVN::verifyMemoryCongruency() const { // and redoing the iteration to see if anything changed. void NewGVN::verifyIterationSettled(Function &F) { #ifndef NDEBUG - DEBUG(dbgs() << "Beginning iteration verification\n"); + LLVM_DEBUG(dbgs() << "Beginning iteration verification\n"); if (DebugCounter::isCounterSet(VNCounter)) DebugCounter::setCounterValue(VNCounter, StartingVNCounter); @@ -3364,9 +3403,9 @@ void NewGVN::iterateTouchedInstructions() { // If it's not reachable, erase any touched instructions and move on.
if (!BlockReachable) { TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second); - DEBUG(dbgs() << "Skipping instructions in block " - << getBlockName(CurrBlock) - << " because it is unreachable\n"); + LLVM_DEBUG(dbgs() << "Skipping instructions in block " + << getBlockName(CurrBlock) + << " because it is unreachable\n"); continue; } updateProcessedCount(CurrBlock); @@ -3376,7 +3415,7 @@ void NewGVN::iterateTouchedInstructions() { TouchedInstructions.reset(InstrNum); if (auto *MP = dyn_cast<MemoryPhi>(V)) { - DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n"); + LLVM_DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n"); valueNumberMemoryPhi(MP); } else if (auto *I = dyn_cast<Instruction>(V)) { valueNumberInstruction(I); @@ -3422,10 +3461,10 @@ bool NewGVN::runGVN() { for (auto &B : RPOT) { auto *Node = DT->getNode(B); if (Node->getChildren().size() > 1) - std::sort(Node->begin(), Node->end(), - [&](const DomTreeNode *A, const DomTreeNode *B) { - return RPOOrdering[A] < RPOOrdering[B]; - }); + llvm::sort(Node->begin(), Node->end(), + [&](const DomTreeNode *A, const DomTreeNode *B) { + return RPOOrdering[A] < RPOOrdering[B]; + }); } // Now a standard depth first ordering of the domtree is equivalent to RPO. @@ -3446,8 +3485,8 @@ bool NewGVN::runGVN() { // Initialize the touched instructions to include the entry block. const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock()); TouchedInstructions.set(InstRange.first, InstRange.second); - DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock()) - << " marked reachable\n"); + LLVM_DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock()) + << " marked reachable\n"); ReachableBlocks.insert(&F.getEntryBlock()); iterateTouchedInstructions(); @@ -3472,8 +3511,8 @@ bool NewGVN::runGVN() { }; for (auto &BB : make_filter_range(F, UnreachableBlockPred)) { - DEBUG(dbgs() << "We believe block " << getBlockName(&BB) - << " is unreachable\n"); + LLVM_DEBUG(dbgs() << "We believe block " << getBlockName(&BB) + << " is unreachable\n"); deleteInstructionsInBlock(&BB); Changed = true; } @@ -3695,7 +3734,7 @@ static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) { } void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) { - DEBUG(dbgs() << " BasicBlock Dead:" << *BB); + LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB); ++NumGVNBlocksDeleted; // Delete the instructions backwards, as it has a reduced likelihood of having @@ -3722,12 +3761,12 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) { } void NewGVN::markInstructionForDeletion(Instruction *I) { - DEBUG(dbgs() << "Marking " << *I << " for deletion\n"); + LLVM_DEBUG(dbgs() << "Marking " << *I << " for deletion\n"); InstructionsToErase.insert(I); } void NewGVN::replaceInstruction(Instruction *I, Value *V) { - DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n"); + LLVM_DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n"); patchAndReplaceAllUsesWith(I, V); // We save the actual erasing to avoid invalidating memory // dependencies until we are done with everything. 
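A recurring mechanical change throughout these hunks is the rename of LLVM's debug-logging macro from DEBUG to LLVM_DEBUG; the old name collided with DEBUG macros defined by out-of-tree code. As a minimal illustrative sketch only (not part of this diff, assuming the usual DEBUG_TYPE convention from llvm/Support/Debug.h), a pass-local trace statement under the new spelling looks like:

  #define DEBUG_TYPE "newgvn"
  #include "llvm/IR/Instruction.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/raw_ostream.h"

  static void traceInst(const llvm::Instruction &I) {
    // Compiles away in release (NDEBUG) builds; in asserts builds the
    // message prints only under -debug or -debug-only=newgvn.
    LLVM_DEBUG(llvm::dbgs() << "Processing instruction " << I << "\n");
  }

The same guard also wraps multi-statement bodies, LLVM_DEBUG({ ... });, which is why several hunks above re-indent whole blocks rather than single calls.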
@@ -3853,9 +3892,10 @@ bool NewGVN::eliminateInstructions(Function &F) { auto ReplaceUnreachablePHIArgs = [&](PHINode *PHI, BasicBlock *BB) { for (auto &Operand : PHI->incoming_values()) if (!ReachableEdges.count({PHI->getIncomingBlock(Operand), BB})) { - DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block " - << getBlockName(PHI->getIncomingBlock(Operand)) - << " with undef due to it being unreachable\n"); + LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI + << " for block " + << getBlockName(PHI->getIncomingBlock(Operand)) + << " with undef due to it being unreachable\n"); Operand.set(UndefValue::get(PHI->getType())); } }; @@ -3887,7 +3927,8 @@ bool NewGVN::eliminateInstructions(Function &F) { // Map to store the use counts DenseMap<const Value *, unsigned int> UseCounts; for (auto *CC : reverse(CongruenceClasses)) { - DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() << "\n"); + LLVM_DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() + << "\n"); // Track the equivalent store info so we can decide whether to try // dead store elimination. SmallVector<ValueDFS, 8> PossibleDeadStores; @@ -3925,8 +3966,8 @@ bool NewGVN::eliminateInstructions(Function &F) { MembersLeft.insert(Member); continue; } - DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member - << "\n"); + LLVM_DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " + << *Member << "\n"); auto *I = cast<Instruction>(Member); assert(Leader != I && "About to accidentally remove our leader"); replaceInstruction(I, Leader); @@ -3947,7 +3988,7 @@ bool NewGVN::eliminateInstructions(Function &F) { convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead); // Sort the whole thing. - std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end()); + llvm::sort(DFSOrderedSet.begin(), DFSOrderedSet.end()); for (auto &VD : DFSOrderedSet) { int MemberDFSIn = VD.DFSIn; int MemberDFSOut = VD.DFSOut; @@ -3966,24 +4007,24 @@ bool NewGVN::eliminateInstructions(Function &F) { // remove from temp instruction list. AllTempInstructions.erase(PN); auto *DefBlock = getBlockForValue(Def); - DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def - << " into block " - << getBlockName(getBlockForValue(Def)) << "\n"); + LLVM_DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def + << " into block " + << getBlockName(getBlockForValue(Def)) << "\n"); PN->insertBefore(&DefBlock->front()); Def = PN; NumGVNPHIOfOpsEliminations++; } if (EliminationStack.empty()) { - DEBUG(dbgs() << "Elimination Stack is empty\n"); + LLVM_DEBUG(dbgs() << "Elimination Stack is empty\n"); } else { - DEBUG(dbgs() << "Elimination Stack Top DFS numbers are (" - << EliminationStack.dfs_back().first << "," - << EliminationStack.dfs_back().second << ")\n"); + LLVM_DEBUG(dbgs() << "Elimination Stack Top DFS numbers are (" + << EliminationStack.dfs_back().first << "," + << EliminationStack.dfs_back().second << ")\n"); } - DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << "," - << MemberDFSOut << ")\n"); + LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << "," + << MemberDFSOut << ")\n"); // First, we see if we are out of scope or empty. 
If so, // and there are equivalences, we try to replace the top of // stack with equivalences (if it's on the stack, it must @@ -4058,14 +4099,16 @@ bool NewGVN::eliminateInstructions(Function &F) { Value *DominatingLeader = EliminationStack.back(); auto *II = dyn_cast<IntrinsicInst>(DominatingLeader); - if (II && II->getIntrinsicID() == Intrinsic::ssa_copy) + bool isSSACopy = II && II->getIntrinsicID() == Intrinsic::ssa_copy; + if (isSSACopy) DominatingLeader = II->getOperand(0); // Don't replace our existing users with ourselves. if (U->get() == DominatingLeader) continue; - DEBUG(dbgs() << "Found replacement " << *DominatingLeader << " for " - << *U->get() << " in " << *(U->getUser()) << "\n"); + LLVM_DEBUG(dbgs() + << "Found replacement " << *DominatingLeader << " for " + << *U->get() << " in " << *(U->getUser()) << "\n"); // If we replaced something in an instruction, handle the patching of // metadata. Skip this if we are replacing predicateinfo with its @@ -4081,7 +4124,9 @@ bool NewGVN::eliminateInstructions(Function &F) { // It's about to be alive again. if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader)) ProbablyDead.erase(cast<Instruction>(DominatingLeader)); - if (LeaderUseCount == 0 && II) + // Copy instructions, however, are still dead because we use their + // operand as the leader. + if (LeaderUseCount == 0 && isSSACopy) ProbablyDead.insert(II); ++LeaderUseCount; AnythingReplaced = true; @@ -4106,7 +4151,7 @@ bool NewGVN::eliminateInstructions(Function &F) { // If we have possible dead stores to look at, try to eliminate them. if (CC->getStoreCount() > 0) { convertClassToLoadsAndStores(*CC, PossibleDeadStores); - std::sort(PossibleDeadStores.begin(), PossibleDeadStores.end()); + llvm::sort(PossibleDeadStores.begin(), PossibleDeadStores.end()); ValueDFSStack EliminationStack; for (auto &VD : PossibleDeadStores) { int MemberDFSIn = VD.DFSIn; @@ -4129,8 +4174,8 @@ bool NewGVN::eliminateInstructions(Function &F) { (void)Leader; assert(DT->dominates(Leader->getParent(), Member->getParent())); // Member is dominated by Leader, and thus dead - DEBUG(dbgs() << "Marking dead store " << *Member - << " that is dominated by " << *Leader << "\n"); + LLVM_DEBUG(dbgs() << "Marking dead store " << *Member + << " that is dominated by " << *Leader << "\n"); markInstructionForDeletion(Member); CC->erase(Member); ++NumGVNDeadStores; diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index 2d0cb6fbf211..8f30bccf48f1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -55,6 +55,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" @@ -65,7 +66,6 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h" #define DEBUG_TYPE "safepoint-placement" @@ -323,7 +323,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) { // avoiding the runtime cost of the actual safepoint.
if (!AllBackedges) { if (mustBeFiniteCountedLoop(L, SE, Pred)) { - DEBUG(dbgs() << "skipping safepoint placement in finite loop\n"); + LLVM_DEBUG(dbgs() << "skipping safepoint placement in finite loop\n"); FiniteExecution++; continue; } @@ -332,7 +332,9 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) { // Note: This is only semantically legal since we won't do any further // IPO or inlining before the actual call insertion. If we hadn't, we // might later lose this call safepoint. - DEBUG(dbgs() << "skipping safepoint placement due to unconditional call\n"); + LLVM_DEBUG( + dbgs() + << "skipping safepoint placement due to unconditional call\n"); CallInLoop++; continue; } @@ -348,7 +350,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) { // variables) and branches to the true header TerminatorInst *Term = Pred->getTerminator(); - DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term); + LLVM_DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term); PollLocations.push_back(Term); } @@ -522,7 +524,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { }; // We need the order of the list to be stable so that naming ends up stable // when we split edges. This makes test cases much easier to write. - std::sort(PollLocations.begin(), PollLocations.end(), OrderByBBName); + llvm::sort(PollLocations.begin(), PollLocations.end(), OrderByBBName); // We can sometimes end up with duplicate poll locations. This happens if // a single loop is visited more than once. The fact this happens seems diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index 88dcaf0f8a36..c81ac70d99e6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -42,6 +43,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" @@ -55,7 +57,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <utility> @@ -168,8 +169,8 @@ void ReassociatePass::BuildRankMap(Function &F, // Assign distinct ranks to function arguments. for (auto &Arg : F.args()) { ValueRankMap[&Arg] = ++Rank; - DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank - << "\n"); + LLVM_DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank + << "\n"); } // Traverse basic blocks in ReversePostOrder @@ -200,17 +201,17 @@ unsigned ReassociatePass::getRank(Value *V) { // for PHI nodes, we cannot have infinite recursion here, because there // cannot be loops in the value graph that do not go through PHI nodes. unsigned Rank = 0, MaxRank = RankMap[I->getParent()]; - for (unsigned i = 0, e = I->getNumOperands(); - i != e && Rank != MaxRank; ++i) + for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank != MaxRank; ++i) Rank = std::max(Rank, getRank(I->getOperand(i))); // If this is a not or neg instruction, do not count it for rank. This // assures us that X and ~X will have the same rank.
- if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) && - !BinaryOperator::isFNeg(I)) + if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) && + !BinaryOperator::isFNeg(I)) ++Rank; - DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n"); + LLVM_DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank + << "\n"); return ValueRankMap[I] = Rank; } @@ -445,7 +446,7 @@ using RepeatedValue = std::pair<Value*, APInt>; /// type and thus make the expression bigger. static bool LinearizeExprTree(BinaryOperator *I, SmallVectorImpl<RepeatedValue> &Ops) { - DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); + LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits(); unsigned Opcode = I->getOpcode(); assert(I->isAssociative() && I->isCommutative() && @@ -494,14 +495,14 @@ static bool LinearizeExprTree(BinaryOperator *I, for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands. Value *Op = I->getOperand(OpIdx); APInt Weight = P.second; // Number of paths to this operand. - DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n"); + LLVM_DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n"); assert(!Op->use_empty() && "No uses, so how did we get to it?!"); // If this is a binary operation of the right kind with only one use then // add its operands to the expression. if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { assert(Visited.insert(Op).second && "Not first visit!"); - DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n"); + LLVM_DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n"); Worklist.push_back(std::make_pair(BO, Weight)); continue; } @@ -514,7 +515,8 @@ static bool LinearizeExprTree(BinaryOperator *I, if (!Op->hasOneUse()) { // This value has uses not accounted for by the expression, so it is // not safe to modify. Mark it as being a leaf. - DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n"); + LLVM_DEBUG(dbgs() + << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n"); LeafOrder.push_back(Op); Leaves[Op] = Weight; continue; @@ -540,7 +542,7 @@ static bool LinearizeExprTree(BinaryOperator *I, // to the expression, then no longer consider it to be a leaf and add // its operands to the expression. if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { - DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n"); + LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n"); Worklist.push_back(std::make_pair(BO, It->second)); Leaves.erase(It); continue; @@ -573,9 +575,10 @@ static bool LinearizeExprTree(BinaryOperator *I, if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) || (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) { - DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO "); + LLVM_DEBUG(dbgs() + << "MORPH LEAF: " << *Op << " (" << Weight << ") TO "); BO = LowerNegateToMultiply(BO); - DEBUG(dbgs() << *BO << '\n'); + LLVM_DEBUG(dbgs() << *BO << '\n'); Worklist.push_back(std::make_pair(BO, Weight)); Changed = true; continue; @@ -583,7 +586,7 @@ static bool LinearizeExprTree(BinaryOperator *I, // Failed to morph into an expression of the right type. This really is // a leaf. 
- DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n"); + LLVM_DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n"); assert(!isReassociableOp(Op, Opcode) && "Value was morphed?"); LeafOrder.push_back(Op); Leaves[Op] = Weight; @@ -675,9 +678,9 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, if (NewLHS == OldRHS && NewRHS == OldLHS) { // The order of the operands was reversed. Swap them. - DEBUG(dbgs() << "RA: " << *Op << '\n'); + LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n'); Op->swapOperands(); - DEBUG(dbgs() << "TO: " << *Op << '\n'); + LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n'); MadeChange = true; ++NumChanged; break; @@ -685,7 +688,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, // The new operation differs non-trivially from the original. Overwrite // the old operands with the new ones. - DEBUG(dbgs() << "RA: " << *Op << '\n'); + LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n'); if (NewLHS != OldLHS) { BinaryOperator *BO = isReassociableOp(OldLHS, Opcode); if (BO && !NotRewritable.count(BO)) @@ -698,7 +701,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, NodesToRewrite.push_back(BO); Op->setOperand(1, NewRHS); } - DEBUG(dbgs() << "TO: " << *Op << '\n'); + LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n'); ExpressionChanged = Op; MadeChange = true; @@ -711,7 +714,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, // while the right-hand side will be the current element of Ops. Value *NewRHS = Ops[i].Op; if (NewRHS != Op->getOperand(1)) { - DEBUG(dbgs() << "RA: " << *Op << '\n'); + LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n'); if (NewRHS == Op->getOperand(0)) { // The new right-hand side was already present as the left operand. If // we are lucky then swapping the operands will sort out both of them. @@ -724,7 +727,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, Op->setOperand(1, NewRHS); ExpressionChanged = Op; } - DEBUG(dbgs() << "TO: " << *Op << '\n'); + LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n'); MadeChange = true; ++NumChanged; } @@ -756,9 +759,9 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, NewOp = NodesToRewrite.pop_back_val(); } - DEBUG(dbgs() << "RA: " << *Op << '\n'); + LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n'); Op->setOperand(0, NewOp); - DEBUG(dbgs() << "TO: " << *Op << '\n'); + LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n'); ExpressionChanged = Op; MadeChange = true; ++NumChanged; @@ -781,6 +784,18 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, if (ExpressionChanged == I) break; + + // Discard any debug info related to the expressions that has changed (we + // can leave debug infor related to the root, since the result of the + // expression tree should be the same even after reassociation). + SmallVector<DbgInfoIntrinsic *, 1> DbgUsers; + findDbgUsers(DbgUsers, ExpressionChanged); + for (auto *DII : DbgUsers) { + Value *Undef = UndefValue::get(ExpressionChanged->getType()); + DII->setOperand(0, MetadataAsValue::get(DII->getContext(), + ValueAsMetadata::get(Undef))); + } + ExpressionChanged->moveBefore(I); ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin()); } while (true); @@ -798,7 +813,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, /// pushing the negates through adds. These will be revisited to see if /// additional opportunities have been exposed. 
static Value *NegateValue(Value *V, Instruction *BI, - SetVector<AssertingVH<Instruction>> &ToRedo) { + ReassociatePass::OrderedSet &ToRedo) { if (auto *C = dyn_cast<Constant>(V)) return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) : ConstantExpr::getNeg(C); @@ -912,8 +927,8 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { /// If we have (X-Y), and if either X is an add, or if this is only used by an /// add, transform this into (X+(0-Y)) to promote better reassociation. -static BinaryOperator * -BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) { +static BinaryOperator *BreakUpSubtract(Instruction *Sub, + ReassociatePass::OrderedSet &ToRedo) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. // @@ -929,7 +944,7 @@ BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) { Sub->replaceAllUsesWith(New); New->setDebugLoc(Sub->getDebugLoc()); - DEBUG(dbgs() << "Negated: " << *New << '\n'); + LLVM_DEBUG(dbgs() << "Negated: " << *New << '\n'); return New; } @@ -1415,7 +1430,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, ++NumFound; } while (i != Ops.size() && Ops[i].Op == TheOp); - DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); + LLVM_DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp + << '\n'); ++NumFactor; // Insert a new multiply. @@ -1553,7 +1569,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, // If any factor occurred more than one time, we can pull it out. if (MaxOcc > 1) { - DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); + LLVM_DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal + << '\n'); ++NumFactor; // Create a new instruction that uses the MaxOccVal twice. If we don't do @@ -1622,7 +1639,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, return nullptr; } -/// \brief Build up a vector of value/power pairs factoring a product. +/// Build up a vector of value/power pairs factoring a product. /// /// Given a series of multiplication operands, build a vector of factors and /// the powers each is raised to when forming the final product. Sort them in @@ -1687,7 +1704,7 @@ static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, return true; } -/// \brief Build a tree of multiplies, computing the product of Ops. +/// Build a tree of multiplies, computing the product of Ops. static Value *buildMultiplyTree(IRBuilder<> &Builder, SmallVectorImpl<Value*> &Ops) { if (Ops.size() == 1) @@ -1704,7 +1721,7 @@ static Value *buildMultiplyTree(IRBuilder<> &Builder, return LHS; } -/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*... +/// Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*... /// /// Given a vector of values raised to various powers, where no two values are /// equal and the powers are sorted in decreasing order, compute the minimal @@ -1859,8 +1876,8 @@ Value *ReassociatePass::OptimizeExpression(BinaryOperator *I, // Remove dead instructions and if any operands are trivially dead add them to // Insts so they will be removed as well. 
-void ReassociatePass::RecursivelyEraseDeadInsts( - Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) { +void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I, + OrderedSet &Insts) { assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end()); ValueRankMap.erase(I); @@ -1876,7 +1893,7 @@ void ReassociatePass::RecursivelyEraseDeadInsts( /// Zap the given instruction, adding interesting operands to the work list. void ReassociatePass::EraseInst(Instruction *I) { assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); - DEBUG(dbgs() << "Erasing dead inst: "; I->dump()); + LLVM_DEBUG(dbgs() << "Erasing dead inst: "; I->dump()); SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end()); // Erase the dead instruction. @@ -1893,7 +1910,14 @@ void ReassociatePass::EraseInst(Instruction *I) { while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode && Visited.insert(Op).second) Op = Op->user_back(); - RedoInsts.insert(Op); + + // The instruction we're going to push may be coming from a + // dead block, and Reassociate skips the processing of unreachable + // blocks because it's a waste of time and also because it can + // lead to an infinite loop due to LLVM's non-standard definition + // of dominance. + if (ValueRankMap.find(Op) != ValueRankMap.end()) + RedoInsts.insert(Op); } MadeChange = true; @@ -2120,7 +2144,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { ValueEntry(getRank(E.first), E.first)); } - DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n'); // Now that we have linearized the tree to a list and have gathered all of // the operands and their ranks, sort the operands by their rank. Use a @@ -2138,7 +2162,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { return; // This expression tree simplified to something that isn't a tree, // eliminate it. - DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n'); + LLVM_DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n'); I->replaceAllUsesWith(V); if (Instruction *VI = dyn_cast<Instruction>(V)) if (I->getDebugLoc()) @@ -2169,7 +2193,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { } } - DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n'); if (Ops.size() == 1) { if (Ops[0].Op == I) @@ -2321,7 +2345,7 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { // Make a copy of all the instructions to be redone so we can remove dead // instructions. - SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts); + OrderedSet ToRedo(RedoInsts); // Iterate over all instructions to be reevaluated and remove trivially dead // instructions. If any operand of the trivially dead instruction becomes // dead mark it for deletion as well. Continue this process until all // trivially dead instructions have been removed. // Now that we have removed dead instructions, we can reoptimize the // remaining instructions.
while (!RedoInsts.empty()) { - Instruction *I = RedoInsts.pop_back_val(); + Instruction *I = RedoInsts.front(); + RedoInsts.erase(RedoInsts.begin()); if (isInstructionTriviallyDead(I)) EraseInst(I); else diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 96295683314c..018feb035a4f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -17,6 +17,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Function.h" @@ -25,7 +26,7 @@ #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils.h" #include <list> using namespace llvm; diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index c44edbed8ed9..391e43f79121 100644 --- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -64,7 +65,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include <algorithm> #include <cassert> @@ -476,6 +476,12 @@ findBaseDefiningValueOfVector(Value *I) { if (auto *BC = dyn_cast<BitCastInst>(I)) return findBaseDefiningValue(BC->getOperand(0)); + // We assume that functions in the source language only return base + // pointers. This should probably be generalized via attributes to support + // both source language and internal functions. + if (isa<CallInst>(I) || isa<InvokeInst>(I)) + return BaseDefiningValueResult(I, true); + // A PHI or Select is a base defining value. The outer findBasePointer // algorithm is responsible for constructing a base value for this BDV. 
assert((isa<SelectInst>(I) || isa<PHINode>(I)) && @@ -610,8 +616,8 @@ static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) { Value *&Cached = Cache[I]; if (!Cached) { Cached = findBaseDefiningValue(I).BDV; - DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> " - << Cached->getName() << "\n"); + LLVM_DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> " + << Cached->getName() << "\n"); } assert(Cache[I] != nullptr); return Cached; @@ -842,9 +848,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { } #ifndef NDEBUG - DEBUG(dbgs() << "States after initialization:\n"); + LLVM_DEBUG(dbgs() << "States after initialization:\n"); for (auto Pair : States) { - DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); + LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); } #endif @@ -917,9 +923,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { } #ifndef NDEBUG - DEBUG(dbgs() << "States after meet iteration:\n"); + LLVM_DEBUG(dbgs() << "States after meet iteration:\n"); for (auto Pair : States) { - DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); + LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); } #endif @@ -960,7 +966,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* { if (isa<PHINode>(I)) { BasicBlock *BB = I->getParent(); - int NumPreds = std::distance(pred_begin(BB), pred_end(BB)); + int NumPreds = pred_size(BB); assert(NumPreds > 0 && "how did we reach here"); std::string Name = suffixed_name_or(I, ".base", "base_phi"); return PHINode::Create(I->getType(), NumPreds, Name, I); @@ -1118,10 +1124,11 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { assert(BDV && Base); assert(!isKnownBaseResult(BDV) && "why did it get added?"); - DEBUG(dbgs() << "Updating base value cache" - << " for: " << BDV->getName() << " from: " - << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none") - << " to: " << Base->getName() << "\n"); + LLVM_DEBUG( + dbgs() << "Updating base value cache" + << " for: " << BDV->getName() << " from: " + << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none") + << " to: " << Base->getName() << "\n"); if (Cache.count(BDV)) { assert(isKnownBaseResult(Base) && @@ -1369,7 +1376,7 @@ public: assert(OldI != NewI && "Disallowed at construction?!"); assert((!IsDeoptimize || !New) && - "Deoptimize instrinsics are not replaced!"); + "Deoptimize intrinsics are not replaced!"); Old = nullptr; New = nullptr; @@ -1379,7 +1386,7 @@ public: if (IsDeoptimize) { // Note: we've inserted instructions, so the call to llvm.deoptimize may - // not necessarilly be followed by the matching return. + // not necessarily be followed by the matching return. 
auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator()); new UnreachableInst(RI->getContext(), RI); RI->eraseFromParent(); @@ -1805,7 +1812,7 @@ static void relocationViaAlloca( SmallVector<Instruction *, 20> Uses; // PERF: trade a linear scan for repeated reallocation - Uses.reserve(std::distance(Def->user_begin(), Def->user_end())); + Uses.reserve(Def->getNumUses()); for (User *U : Def->users()) { if (!isa<ConstantExpr>(U)) { // If the def has a ConstantExpr use, then the def is either a @@ -1817,7 +1824,7 @@ static void relocationViaAlloca( } } - std::sort(Uses.begin(), Uses.end()); + llvm::sort(Uses.begin(), Uses.end()); auto Last = std::unique(Uses.begin(), Uses.end()); Uses.erase(Last, Uses.end()); @@ -1977,7 +1984,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain, Cost += 2; } else { - llvm_unreachable("unsupported instruciton type during rematerialization"); + llvm_unreachable("unsupported instruction type during rematerialization"); } } @@ -2024,7 +2031,7 @@ static void rematerializeLiveValues(CallSite CS, SmallVector<Value *, 32> LiveValuesToBeDeleted; for (Value *LiveValue: Info.LiveSet) { - // For each live pointer find it's defining chain + // For each live pointer find its defining chain SmallVector<Instruction *, 3> ChainToBase; assert(Info.PointerToBase.count(LiveValue)); Value *RootOfChain = @@ -2461,22 +2468,8 @@ static void stripNonValidDataFromBody(Function &F) { continue; } - if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { - assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); - bool IsImmutableTBAA = - MD->getNumOperands() == 4 && - mdconst::extract<ConstantInt>(MD->getOperand(3))->getValue() == 1; - - if (!IsImmutableTBAA) - continue; // no work to do, MD_tbaa is already marked mutable - - MDNode *Base = cast<MDNode>(MD->getOperand(0)); - MDNode *Access = cast<MDNode>(MD->getOperand(1)); - uint64_t Offset = - mdconst::extract<ConstantInt>(MD->getOperand(2))->getZExtValue(); - - MDNode *MutableTBAA = - Builder.createTBAAStructTagNode(Base, Access, Offset); + if (MDNode *Tag = I.getMetadata(LLVMContext::MD_tbaa)) { + MDNode *MutableTBAA = Builder.createMutableTBAAAccessTag(Tag); I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA); } @@ -2537,30 +2530,31 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, return false; }; + + // Delete any unreachable statepoints so that we don't have unrewritten + // statepoints surviving this pass. This makes testing easier and the + // resulting IR less confusing to human readers. + DeferredDominance DD(DT); + bool MadeChange = removeUnreachableBlocks(F, nullptr, &DD); + DD.flush(); + // Gather all the statepoints which need rewritten. Be careful to only // consider those in reachable code since we need to ask dominance queries // when rewriting. We'll delete the unreachable ones in a moment. SmallVector<CallSite, 64> ParsePointNeeded; - bool HasUnreachableStatepoint = false; for (Instruction &I : instructions(F)) { // TODO: only the ones with the flag set! if (NeedsRewrite(I)) { - if (DT.isReachableFromEntry(I.getParent())) - ParsePointNeeded.push_back(CallSite(&I)); - else - HasUnreachableStatepoint = true; + // NOTE removeUnreachableBlocks() is stronger than + // DominatorTree::isReachableFromEntry(). In other words + // removeUnreachableBlocks can remove some blocks for which + // isReachableFromEntry() returns true. 
+ assert(DT.isReachableFromEntry(I.getParent()) && + "no unreachable blocks expected"); + ParsePointNeeded.push_back(CallSite(&I)); } } - bool MadeChange = false; - - // Delete any unreachable statepoints so that we don't have unrewritten - // statepoints surviving this pass. This makes testing easier and the - // resulting IR less confusing to human readers. Rather than be fancy, we - // just reuse a utility function which removes the unreachable blocks. - if (HasUnreachableStatepoint) - MadeChange |= removeUnreachableBlocks(F); - // Return early if no work to do. if (ParsePointNeeded.empty()) return MadeChange; diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index 3e12649ddedc..5e3ddeda2d49 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -17,7 +17,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO/SCCP.h" #include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -30,6 +29,7 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/IR/BasicBlock.h" @@ -54,9 +54,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <utility> #include <vector> @@ -71,8 +69,6 @@ STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable"); STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP"); STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP"); STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP"); -STATISTIC(IPNumRangeInfoUsed, "Number of times constant range info was used by" - "IPSCCP"); namespace { @@ -261,7 +257,7 @@ public: bool MarkBlockExecutable(BasicBlock *BB) { if (!BBExecutable.insert(BB).second) return false; - DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n'); + LLVM_DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n'); BBWorkList.push_back(BB); // Add the block to the work list! return true; } @@ -329,6 +325,10 @@ public: return BBExecutable.count(BB); } + // isEdgeFeasible - Return true if the control flow edge from the 'From' basic + // block to the 'To' basic block is currently feasible. 
+ bool isEdgeFeasible(BasicBlock *From, BasicBlock *To); + std::vector<LatticeVal> getStructLatticeValueFor(Value *V) const { std::vector<LatticeVal> StructValues; auto *STy = dyn_cast<StructType>(V->getType()); @@ -341,20 +341,13 @@ public: return StructValues; } - ValueLatticeElement getLatticeValueFor(Value *V) { + const LatticeVal &getLatticeValueFor(Value *V) const { assert(!V->getType()->isStructTy() && "Should use getStructLatticeValueFor"); - std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool> - PI = ParamState.insert(std::make_pair(V, ValueLatticeElement())); - ValueLatticeElement &LV = PI.first->second; - if (PI.second) { - DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V); - assert(I != ValueState.end() && - "V not found in ValueState nor Paramstate map!"); - LV = I->second.toValueLattice(); - } - - return LV; + DenseMap<Value *, LatticeVal>::const_iterator I = ValueState.find(V); + assert(I != ValueState.end() && + "V not found in ValueState nor Paramstate map!"); + return I->second; } /// getTrackedRetVals - Get the inferred return value map. @@ -415,55 +408,57 @@ private: // markConstant - Make a value be marked as "constant". If the value // is not already a constant, add it to the instruction work list so that // the users of the instruction are updated later. - void markConstant(LatticeVal &IV, Value *V, Constant *C) { - if (!IV.markConstant(C)) return; - DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n'); + bool markConstant(LatticeVal &IV, Value *V, Constant *C) { + if (!IV.markConstant(C)) return false; + LLVM_DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n'); pushToWorkList(IV, V); + return true; } - void markConstant(Value *V, Constant *C) { + bool markConstant(Value *V, Constant *C) { assert(!V->getType()->isStructTy() && "structs should use mergeInValue"); - markConstant(ValueState[V], V, C); + return markConstant(ValueState[V], V, C); } void markForcedConstant(Value *V, Constant *C) { assert(!V->getType()->isStructTy() && "structs should use mergeInValue"); LatticeVal &IV = ValueState[V]; IV.markForcedConstant(C); - DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n'); + LLVM_DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n'); pushToWorkList(IV, V); } // markOverdefined - Make a value be marked as "overdefined". If the // value is not already overdefined, add it to the overdefined instruction // work list so that the users of the instruction are updated later. - void markOverdefined(LatticeVal &IV, Value *V) { - if (!IV.markOverdefined()) return; - - DEBUG(dbgs() << "markOverdefined: "; - if (auto *F = dyn_cast<Function>(V)) - dbgs() << "Function '" << F->getName() << "'\n"; - else - dbgs() << *V << '\n'); + bool markOverdefined(LatticeVal &IV, Value *V) { + if (!IV.markOverdefined()) return false; + + LLVM_DEBUG(dbgs() << "markOverdefined: "; + if (auto *F = dyn_cast<Function>(V)) dbgs() + << "Function '" << F->getName() << "'\n"; + else dbgs() << *V << '\n'); // Only instructions go on the work list pushToWorkList(IV, V); + return true; } - void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) { + bool mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) { if (IV.isOverdefined() || MergeWithV.isUnknown()) - return; // Noop. + return false; // Noop. 
if (MergeWithV.isOverdefined()) return markOverdefined(IV, V); if (IV.isUnknown()) return markConstant(IV, V, MergeWithV.getConstant()); if (IV.getConstant() != MergeWithV.getConstant()) return markOverdefined(IV, V); + return false; } - void mergeInValue(Value *V, LatticeVal MergeWithV) { + bool mergeInValue(Value *V, LatticeVal MergeWithV) { assert(!V->getType()->isStructTy() && "non-structs should use markConstant"); - mergeInValue(ValueState[V], V, MergeWithV); + return mergeInValue(ValueState[V], V, MergeWithV); } /// getValueState - Return the LatticeVal object that corresponds to the @@ -534,30 +529,27 @@ private: /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB /// work list if it is not already executable. - void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) { + bool markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) { if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second) - return; // This edge is already known to be executable! + return false; // This edge is already known to be executable! if (!MarkBlockExecutable(Dest)) { // If the destination is already executable, we just made an *edge* // feasible that wasn't before. Revisit the PHI nodes in the block // because they have potentially new operands. - DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName() - << " -> " << Dest->getName() << '\n'); + LLVM_DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName() + << " -> " << Dest->getName() << '\n'); for (PHINode &PN : Dest->phis()) visitPHINode(PN); } + return true; } // getFeasibleSuccessors - Return a vector of booleans to indicate which // successors are reachable from a given terminator instruction. void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs); - // isEdgeFeasible - Return true if the control flow edge from the 'From' basic - // block to the 'To' basic block is currently feasible. - bool isEdgeFeasible(BasicBlock *From, BasicBlock *To); - // OperandChangedState - This method is invoked on all of the users of an // instruction that was just changed state somehow. Based on this // information, we need to update the specified user of this instruction. @@ -614,7 +606,7 @@ private: void visitInstruction(Instruction &I) { // All the instructions we don't do any special handling for just // go to overdefined. - DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n'); + LLVM_DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n'); markOverdefined(&I); } }; @@ -701,68 +693,17 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, return; } - DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n'); + LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n'); llvm_unreachable("SCCP: Don't know how to handle this terminator!"); } // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { - assert(BBExecutable.count(To) && "Dest should always be alive!"); - - // Make sure the source basic block is executable!! - if (!BBExecutable.count(From)) return false; - - // Check to make sure this edge itself is actually feasible now. 
- TerminatorInst *TI = From->getTerminator(); - if (auto *BI = dyn_cast<BranchInst>(TI)) { - if (BI->isUnconditional()) - return true; - - LatticeVal BCValue = getValueState(BI->getCondition()); - - // Overdefined condition variables mean the branch could go either way, - // undef conditions mean that neither edge is feasible yet. - ConstantInt *CI = BCValue.getConstantInt(); - if (!CI) - return !BCValue.isUnknown(); - - // Constant condition variables mean the branch can only go a single way. - return BI->getSuccessor(CI->isZero()) == To; - } - - // Unwinding instructions successors are always executable. - if (TI->isExceptional()) - return true; - - if (auto *SI = dyn_cast<SwitchInst>(TI)) { - if (SI->getNumCases() < 1) - return true; - - LatticeVal SCValue = getValueState(SI->getCondition()); - ConstantInt *CI = SCValue.getConstantInt(); - - if (!CI) - return !SCValue.isUnknown(); - - return SI->findCaseValue(CI)->getCaseSuccessor() == To; - } - - // In case of indirect branch and its address is a blockaddress, we mark - // the target as executable. - if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) { - LatticeVal IBRValue = getValueState(IBR->getAddress()); - BlockAddress *Addr = IBRValue.getBlockAddress(); - - if (!Addr) - return !IBRValue.isUnknown(); - - // At this point, the indirectbr is branching on a blockaddress. - return Addr->getBasicBlock() == To; - } - - DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n'); - llvm_unreachable("SCCP: Don't know how to handle this terminator!"); + // Check if we've called markEdgeExecutable on the edge yet. (We could + // be more aggressive and try to consider edges which haven't been marked + // yet, but there isn't any need.) + return KnownFeasibleEdges.count(Edge(From, To)); } // visit Implementations - Something changed in this instruction, either an @@ -786,7 +727,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // If this PN returns a struct, just mark the result overdefined. // TODO: We could do a lot better than this if code actually uses this. if (PN.getType()->isStructTy()) - return markOverdefined(&PN); + return (void)markOverdefined(&PN); if (getValueState(&PN).isOverdefined()) return; // Quick exit @@ -794,7 +735,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant, // and slow us down a lot. Just mark them overdefined. if (PN.getNumIncomingValues() > 64) - return markOverdefined(&PN); + return (void)markOverdefined(&PN); // Look at all of the executable operands of the PHI node. If any of them // are overdefined, the PHI becomes overdefined as well. If they are all @@ -810,7 +751,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { continue; if (IV.isOverdefined()) // PHI node becomes overdefined! - return markOverdefined(&PN); + return (void)markOverdefined(&PN); if (!OperandVal) { // Grab the first value. OperandVal = IV.getConstant(); @@ -824,7 +765,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // Check to see if there are two different constants merging, if so, the PHI // node is overdefined. if (IV.getConstant() != OperandVal) - return markOverdefined(&PN); + return (void)markOverdefined(&PN); } // If we exited the loop, this means that the PHI node only has constant @@ -892,11 +833,11 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { // If this returns a struct, mark all elements overdefined; we don't track // structs in structs.
if (EVI.getType()->isStructTy()) - return markOverdefined(&EVI); + return (void)markOverdefined(&EVI); // If this is extracting from more than one level of struct, we don't know. if (EVI.getNumIndices() != 1) - return markOverdefined(&EVI); + return (void)markOverdefined(&EVI); Value *AggVal = EVI.getAggregateOperand(); if (AggVal->getType()->isStructTy()) { @@ -905,19 +846,19 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { mergeInValue(getValueState(&EVI), &EVI, EltVal); } else { // Otherwise, must be extracting from an array. - return markOverdefined(&EVI); + return (void)markOverdefined(&EVI); } } void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) { auto *STy = dyn_cast<StructType>(IVI.getType()); if (!STy) - return markOverdefined(&IVI); + return (void)markOverdefined(&IVI); // If this has more than one index, we can't handle it, drive all results to // undef. if (IVI.getNumIndices() != 1) - return markOverdefined(&IVI); + return (void)markOverdefined(&IVI); Value *Aggr = IVI.getAggregateOperand(); unsigned Idx = *IVI.idx_begin(); @@ -946,7 +887,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) { // If this select returns a struct, just mark the result overdefined. // TODO: We could do a lot better than this if code actually uses this. if (I.getType()->isStructTy()) - return markOverdefined(&I); + return (void)markOverdefined(&I); LatticeVal CondValue = getValueState(I.getCondition()); if (CondValue.isUnknown()) @@ -967,12 +908,12 @@ void SCCPSolver::visitSelectInst(SelectInst &I) { // select ?, C, C -> C. if (TVal.isConstant() && FVal.isConstant() && TVal.getConstant() == FVal.getConstant()) - return markConstant(&I, FVal.getConstant()); + return (void)markConstant(&I, FVal.getConstant()); if (TVal.isUnknown()) // select ?, undef, X -> X. - return mergeInValue(&I, FVal); + return (void)mergeInValue(&I, FVal); if (FVal.isUnknown()) // select ?, X, undef -> X. - return mergeInValue(&I, TVal); + return (void)mergeInValue(&I, TVal); markOverdefined(&I); } @@ -990,7 +931,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { // X op Y -> undef. if (isa<UndefValue>(C)) return; - return markConstant(IV, &I, C); + return (void)markConstant(IV, &I, C); } // If something is undef, wait for it to resolve. @@ -1003,7 +944,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { // overdefined, and we can replace it with zero. if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv) if (V1State.isConstant() && V1State.getConstant()->isNullValue()) - return markConstant(IV, &I, V1State.getConstant()); + return (void)markConstant(IV, &I, V1State.getConstant()); // If this is: // -> AND/MUL with 0 @@ -1026,12 +967,12 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { // X and 0 = 0 // X * 0 = 0 if (NonOverdefVal->getConstant()->isNullValue()) - return markConstant(IV, &I, NonOverdefVal->getConstant()); + return (void)markConstant(IV, &I, NonOverdefVal->getConstant()); } else { // X or -1 = -1 if (ConstantInt *CI = NonOverdefVal->getConstantInt()) if (CI->isMinusOne()) - return markConstant(IV, &I, NonOverdefVal->getConstant()); + return (void)markConstant(IV, &I, NonOverdefVal->getConstant()); } } } @@ -1041,22 +982,36 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { // Handle ICmpInst instruction. 
void SCCPSolver::visitCmpInst(CmpInst &I) { - LatticeVal V1State = getValueState(I.getOperand(0)); - LatticeVal V2State = getValueState(I.getOperand(1)); - LatticeVal &IV = ValueState[&I]; if (IV.isOverdefined()) return; - if (V1State.isConstant() && V2State.isConstant()) { - Constant *C = ConstantExpr::getCompare( - I.getPredicate(), V1State.getConstant(), V2State.getConstant()); + Value *Op1 = I.getOperand(0); + Value *Op2 = I.getOperand(1); + + // For parameters, use ParamState which includes constant range info if + // available. + auto V1Param = ParamState.find(Op1); + ValueLatticeElement V1State = (V1Param != ParamState.end()) + ? V1Param->second + : getValueState(Op1).toValueLattice(); + + auto V2Param = ParamState.find(Op2); + ValueLatticeElement V2State = V2Param != ParamState.end() + ? V2Param->second + : getValueState(Op2).toValueLattice(); + + Constant *C = V1State.getCompare(I.getPredicate(), I.getType(), V2State); + if (C) { if (isa<UndefValue>(C)) return; - return markConstant(IV, &I, C); + LatticeVal CV; + CV.markConstant(C); + mergeInValue(&I, CV); + return; } // If operands are still unknown, wait for it to resolve. - if (!V1State.isOverdefined() && !V2State.isOverdefined()) + if (!V1State.isOverdefined() && !V2State.isOverdefined() && !IV.isConstant()) return; markOverdefined(&I); @@ -1076,7 +1031,7 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { return; // Operands are not resolved yet. if (State.isOverdefined()) - return markOverdefined(&I); + return (void)markOverdefined(&I); assert(State.isConstant() && "Unknown state!"); Operands.push_back(State.getConstant()); @@ -1114,7 +1069,7 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) { void SCCPSolver::visitLoadInst(LoadInst &I) { // If this load is of a struct, just mark the result overdefined. if (I.getType()->isStructTy()) - return markOverdefined(&I); + return (void)markOverdefined(&I); LatticeVal PtrVal = getValueState(I.getOperand(0)); if (PtrVal.isUnknown()) return; // The pointer is not resolved yet! @@ -1123,13 +1078,17 @@ void SCCPSolver::visitLoadInst(LoadInst &I) { if (IV.isOverdefined()) return; if (!PtrVal.isConstant() || I.isVolatile()) - return markOverdefined(IV, &I); + return (void)markOverdefined(IV, &I); Constant *Ptr = PtrVal.getConstant(); // load null is undefined. - if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0) - return; + if (isa<ConstantPointerNull>(Ptr)) { + if (NullPointerIsDefined(I.getFunction(), I.getPointerAddressSpace())) + return (void)markOverdefined(IV, &I); + else + return; + } // Transform load (constant global) into the value loaded. if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) { @@ -1148,7 +1107,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) { if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) { if (isa<UndefValue>(C)) return; - return markConstant(IV, &I, C); + return (void)markConstant(IV, &I, C); } // Otherwise we cannot say for certain what value this load will produce. @@ -1180,7 +1139,7 @@ CallOverdefined: if (State.isUnknown()) return; // Operands are not resolved yet. if (State.isOverdefined()) - return markOverdefined(I); + return (void)markOverdefined(I); assert(State.isConstant() && "Unknown state!"); Operands.push_back(State.getConstant()); } @@ -1194,12 +1153,12 @@ CallOverdefined: // call -> undef. if (isa<UndefValue>(C)) return; - return markConstant(I, C); + return (void)markConstant(I, C); } } // Otherwise, we don't know anything about this call, mark it overdefined. 
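The call-handling code above only attempts constant folding once every operand has settled to a single constant; one unknown operand means "wait", and one overdefined operand forces the call overdefined. A compact sketch of that operand scan (ToyVal is the same invented stand-in used in the earlier sketch, redeclared so this example is self-contained):

    #include <cstdint>
    #include <vector>

    struct ToyVal { // invented stand-in for the solver's lattice values
      enum Kind { Unknown, Constant, Overdefined } K = Unknown;
      int64_t C = 0;
    };

    enum class ScanResult { AllConstant, Wait, Overdefined };

    // Mirror of the operand walk above: bail out on the first operand
    // that is not yet (or never will be) a single constant.
    static ScanResult collectCallOperands(const std::vector<ToyVal> &Ops,
                                          std::vector<int64_t> &Consts) {
      for (const ToyVal &Op : Ops) {
        if (Op.K == ToyVal::Unknown)
          return ScanResult::Wait;        // operands are not resolved yet
        if (Op.K == ToyVal::Overdefined)
          return ScanResult::Overdefined; // the call cannot fold
        Consts.push_back(Op.C);
      }
      return ScanResult::AllConstant;     // safe to try constant folding
    }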
- return markOverdefined(I); + return (void)markOverdefined(I); } // If this is a local function that doesn't have its address taken, mark its @@ -1227,8 +1186,16 @@ CallOverdefined: } else { // Most other parts of the Solver still only use the simpler value // lattice, so we propagate changes for parameters to both lattices. - getParamState(&*AI).mergeIn(getValueState(*CAI).toValueLattice(), DL); - mergeInValue(&*AI, getValueState(*CAI)); + LatticeVal ConcreteArgument = getValueState(*CAI); + bool ParamChanged = + getParamState(&*AI).mergeIn(ConcreteArgument.toValueLattice(), DL); + bool ValueChanged = mergeInValue(&*AI, ConcreteArgument); + // Add argument to work list, if the state of a parameter changes but + // ValueState does not change (because it is already overdefined there), + // We have to take changes in ParamState into account, as it is used + // when evaluating Cmp instructions. + if (!ValueChanged && ParamChanged) + pushToWorkList(ValueState[&*AI], &*AI); } } } @@ -1262,7 +1229,7 @@ void SCCPSolver::Solve() { while (!OverdefinedInstWorkList.empty()) { Value *I = OverdefinedInstWorkList.pop_back_val(); - DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n'); + LLVM_DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n'); // "I" got into the work list because it either made the transition from // bottom to constant, or to overdefined. @@ -1280,7 +1247,7 @@ void SCCPSolver::Solve() { while (!InstWorkList.empty()) { Value *I = InstWorkList.pop_back_val(); - DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n'); + LLVM_DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n'); // "I" got into the work list because it made the transition from undef to // constant. @@ -1300,7 +1267,7 @@ void SCCPSolver::Solve() { BasicBlock *BB = BBWorkList.back(); BBWorkList.pop_back(); - DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n'); + LLVM_DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n'); // Notify all instructions in this basic block that they are newly // executable. @@ -1521,7 +1488,11 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { break; case Instruction::ICmp: // X == undef -> undef. Other comparisons get more complicated. - if (cast<ICmpInst>(&I)->isEquality()) + Op0LV = getValueState(I.getOperand(0)); + Op1LV = getValueState(I.getOperand(1)); + + if ((Op0LV.isUnknown() || Op1LV.isUnknown()) && + cast<ICmpInst>(&I)->isEquality()) break; markOverdefined(&I); return true; @@ -1566,11 +1537,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } // Otherwise, it is a branch on a symbolic value which is currently - // considered to be undef. Handle this by forcing the input value to the - // branch to false. - markForcedConstant(BI->getCondition(), - ConstantInt::getFalse(TI->getContext())); - return true; + // considered to be undef. Make sure some edge is executable, so a + // branch on "undef" always flows somewhere. + // FIXME: Distinguish between dead code and an LLVM "undef" value. + BasicBlock *DefaultSuccessor = TI->getSuccessor(1); + if (markEdgeExecutable(&BB, DefaultSuccessor)) + return true; + + continue; } if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) { @@ -1591,11 +1565,15 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } // Otherwise, it is a branch on a symbolic value which is currently - // considered to be undef. Handle this by forcing the input value to the - // branch to the first successor. - markForcedConstant(IBR->getAddress(), - BlockAddress::get(IBR->getSuccessor(0))); - return true; + // considered to be undef. 
Make sure some edge is executable, so a + // branch on "undef" always flows somewhere. + // FIXME: IndirectBr on "undef" doesn't actually need to go anywhere: + // we can assume the branch has undefined behavior instead. + BasicBlock *DefaultSuccessor = IBR->getSuccessor(0); + if (markEdgeExecutable(&BB, DefaultSuccessor)) + return true; + + continue; } if (auto *SI = dyn_cast<SwitchInst>(TI)) { @@ -1610,56 +1588,19 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { return true; } - markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue()); - return true; - } - } - - return false; -} - -static bool tryToReplaceWithConstantRange(SCCPSolver &Solver, Value *V) { - bool Changed = false; - - // Currently we only use range information for integer values. - if (!V->getType()->isIntegerTy()) - return false; - - const ValueLatticeElement &IV = Solver.getLatticeValueFor(V); - if (!IV.isConstantRange()) - return false; + // Otherwise, it is a branch on a symbolic value which is currently + // considered to be undef. Make sure some edge is executable, so a + // branch on "undef" always flows somewhere. + // FIXME: Distinguish between dead code and an LLVM "undef" value. + BasicBlock *DefaultSuccessor = SI->case_begin()->getCaseSuccessor(); + if (markEdgeExecutable(&BB, DefaultSuccessor)) + return true; - for (auto UI = V->uses().begin(), E = V->uses().end(); UI != E;) { - const Use &U = *UI++; - auto *Icmp = dyn_cast<ICmpInst>(U.getUser()); - if (!Icmp || !Solver.isBlockExecutable(Icmp->getParent())) continue; - - auto getIcmpLatticeValue = [&](Value *Op) { - if (auto *C = dyn_cast<Constant>(Op)) - return ValueLatticeElement::get(C); - return Solver.getLatticeValueFor(Op); - }; - - ValueLatticeElement A = getIcmpLatticeValue(Icmp->getOperand(0)); - ValueLatticeElement B = getIcmpLatticeValue(Icmp->getOperand(1)); - - Constant *C = nullptr; - if (A.satisfiesPredicate(Icmp->getPredicate(), B)) - C = ConstantInt::getTrue(Icmp->getType()); - else if (A.satisfiesPredicate(Icmp->getInversePredicate(), B)) - C = ConstantInt::getFalse(Icmp->getType()); - - if (C) { - Icmp->replaceAllUsesWith(C); - DEBUG(dbgs() << "Replacing " << *Icmp << " with " << *C - << ", because of range information " << A << " " << B - << "\n"); - Icmp->eraseFromParent(); - Changed = true; } } - return Changed; + + return false; } static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { @@ -1679,26 +1620,18 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { } Const = ConstantStruct::get(ST, ConstVals); } else { - const ValueLatticeElement &IV = Solver.getLatticeValueFor(V); + const LatticeVal &IV = Solver.getLatticeValueFor(V); if (IV.isOverdefined()) return false; - if (IV.isConstantRange()) { - if (IV.getConstantRange().isSingleElement()) - Const = - ConstantInt::get(V->getType(), IV.asConstantInteger().getValue()); - else - return false; - } else - Const = - IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType()); + Const = IV.isConstant() ? 
IV.getConstant() : UndefValue::get(V->getType()); } assert(Const && "Constant is nullptr here!"); // Replacing `musttail` instructions with constant breaks `musttail` invariant // unless the call itself can be removed CallInst *CI = dyn_cast<CallInst>(V); - if (CI && CI->isMustTailCall() && !isInstructionTriviallyDead(CI)) { + if (CI && CI->isMustTailCall() && !CI->isSafeToRemove()) { CallSite CS(CI); Function *F = CS.getCalledFunction(); @@ -1706,12 +1639,12 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { if (F) Solver.AddMustTailCallee(F); - DEBUG(dbgs() << " Can\'t treat the result of musttail call : " << *CI - << " as a constant\n"); + LLVM_DEBUG(dbgs() << " Can\'t treat the result of musttail call : " << *CI + << " as a constant\n"); return false; } - DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n'); + LLVM_DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n'); // Replaces all of the uses of a variable with uses of the constant. V->replaceAllUsesWith(Const); @@ -1722,7 +1655,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { // and return true if the function was modified. static bool runSCCP(Function &F, const DataLayout &DL, const TargetLibraryInfo *TLI) { - DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); + LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); SCCPSolver Solver(DL, TLI); // Mark the first block of the function as being executable. @@ -1736,7 +1669,7 @@ static bool runSCCP(Function &F, const DataLayout &DL, bool ResolvedUndefs = true; while (ResolvedUndefs) { Solver.Solve(); - DEBUG(dbgs() << "RESOLVING UNDEFs\n"); + LLVM_DEBUG(dbgs() << "RESOLVING UNDEFs\n"); ResolvedUndefs = Solver.ResolvedUndefsIn(F); } @@ -1748,7 +1681,7 @@ static bool runSCCP(Function &F, const DataLayout &DL, for (BasicBlock &BB : F) { if (!Solver.isBlockExecutable(&BB)) { - DEBUG(dbgs() << " BasicBlock Dead:" << BB); + LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB); ++NumDeadBlocks; NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB); @@ -1785,6 +1718,7 @@ PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) { auto PA = PreservedAnalyses(); PA.preserve<GlobalsAA>(); + PA.preserveSet<CFGAnalyses>(); return PA; } @@ -1807,6 +1741,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); + AU.setPreservesCFG(); } // runOnFunction - Run the Sparse Conditional Constant Propagation @@ -1844,15 +1779,15 @@ static void findReturnsToZap(Function &F, // There is a non-removable musttail call site of this function. Zapping // returns is not allowed. 
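The runSCCP driver above reaches a fixed point by alternating two phases: Solve() exhausts the worklists, then ResolvedUndefsIn() forces conservative values for leftover undefs, and the loop repeats until a whole pass resolves nothing new. A schematic, runnable rendering of that driver shape (ToySolver and its two methods are invented; the real SCCPSolver keeps separate instruction, overdefined-instruction, and basic-block worklists):

    // Schematic fixed-point driver in the shape of runSCCP above.
    struct ToySolver {
      int PendingUndefs = 3; // pretend three undefs still need forcing
      void Solve() { /* propagate until all worklists are empty */ }
      bool ResolvedUndefsIn() {
        // Force a conservative value for one leftover undef per round and
        // report whether anything changed, like ResolvedUndefsIn(F).
        if (PendingUndefs == 0)
          return false;
        --PendingUndefs;
        return true;
      }
    };

    static void solveToFixedPoint(ToySolver &S) {
      bool ResolvedUndefs = true;
      while (ResolvedUndefs) {
        S.Solve(); // re-solve with the newly forced values
        ResolvedUndefs = S.ResolvedUndefsIn();
      }
    }

Note how the interprocedural variant changed in this patch: runIPSCCP now calls Solve() immediately after each function that resolves an undef, since a fact deduced in one function can eliminate an undef in another.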
if (Solver.isMustTailCallee(&F)) { - DEBUG(dbgs() << "Can't zap returns of the function : " << F.getName() - << " due to present musttail call of it\n"); + LLVM_DEBUG(dbgs() << "Can't zap returns of the function : " << F.getName() + << " due to present musttail call of it\n"); return; } for (BasicBlock &BB : F) { if (CallInst *CI = BB.getTerminatingMustTailCall()) { - DEBUG(dbgs() << "Can't zap return of the block due to present " - << "musttail call : " << *CI << "\n"); + LLVM_DEBUG(dbgs() << "Can't zap return of the block due to present " + << "musttail call : " << *CI << "\n"); (void)CI; return; } @@ -1863,8 +1798,8 @@ static void findReturnsToZap(Function &F, } } -static bool runIPSCCP(Module &M, const DataLayout &DL, - const TargetLibraryInfo *TLI) { +bool llvm::runIPSCCP(Module &M, const DataLayout &DL, + const TargetLibraryInfo *TLI) { SCCPSolver Solver(DL, TLI); // Loop over all functions, marking arguments to those with their addresses @@ -1904,13 +1839,17 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, // Solve for constants. bool ResolvedUndefs = true; + Solver.Solve(); while (ResolvedUndefs) { - Solver.Solve(); - - DEBUG(dbgs() << "RESOLVING UNDEFS\n"); + LLVM_DEBUG(dbgs() << "RESOLVING UNDEFS\n"); ResolvedUndefs = false; for (Function &F : M) - ResolvedUndefs |= Solver.ResolvedUndefsIn(F); + if (Solver.ResolvedUndefsIn(F)) { + // We run Solve() after we resolved an undef in a function, because + // we might deduce a fact that eliminates an undef in another function. + Solver.Solve(); + ResolvedUndefs = true; + } } bool MadeChanges = false; @@ -1930,18 +1869,12 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, ++IPNumArgsElimed; continue; } - - if (!AI->use_empty() && tryToReplaceWithConstantRange(Solver, &*AI)) - ++IPNumRangeInfoUsed; } for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { if (!Solver.isBlockExecutable(&*BB)) { - DEBUG(dbgs() << " BasicBlock Dead:" << *BB); - + LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB); ++NumDeadBlocks; - NumInstRemoved += - changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false); MadeChanges = true; @@ -1955,7 +1888,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, if (Inst->getType()->isVoidTy()) continue; if (tryToReplaceWithConstant(Solver, Inst)) { - if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst)) + if (Inst->isSafeToRemove()) Inst->eraseFromParent(); // Hey, we just changed something! MadeChanges = true; @@ -1964,6 +1897,17 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, } } + // Change dead blocks to unreachable. We do it after replacing constants in + // all executable blocks, because changeToUnreachable may remove PHI nodes + // in executable blocks we found values for. The function's entry block is + // not part of BlocksToErase, so we have to handle it separately. + for (BasicBlock *BB : BlocksToErase) + NumInstRemoved += + changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false); + if (!Solver.isBlockExecutable(&F.front())) + NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(), + /*UseLLVMTrap=*/false); + // Now that all instructions in the function are constant folded, erase dead // blocks, because we can now use ConstantFoldTerminator to get rid of // in-edges. @@ -1983,31 +1927,33 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, bool Folded = ConstantFoldTerminator(I->getParent()); if (!Folded) { - // The constant folder may not have been able to fold the terminator - // if this is a branch or switch on undef. 
Fold it manually as a - // branch to the first successor. -#ifndef NDEBUG - if (auto *BI = dyn_cast<BranchInst>(I)) { - assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) && - "Branch should be foldable!"); - } else if (auto *SI = dyn_cast<SwitchInst>(I)) { - assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold"); + // If the branch can't be folded, we must have forced an edge + // for an indeterminate value. Force the terminator to fold + // to that edge. + Constant *C; + BasicBlock *Dest; + if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { + Dest = SI->case_begin()->getCaseSuccessor(); + C = SI->case_begin()->getCaseValue(); + } else if (BranchInst *BI = dyn_cast<BranchInst>(I)) { + Dest = BI->getSuccessor(1); + C = ConstantInt::getFalse(BI->getContext()); + } else if (IndirectBrInst *IBR = dyn_cast<IndirectBrInst>(I)) { + Dest = IBR->getSuccessor(0); + C = BlockAddress::get(IBR->getSuccessor(0)); } else { - llvm_unreachable("Didn't fold away reference to block!"); + llvm_unreachable("Unexpected terminator instruction"); } -#endif - - // Make this an uncond branch to the first successor. - TerminatorInst *TI = I->getParent()->getTerminator(); - BranchInst::Create(TI->getSuccessor(0), TI); + assert(Solver.isEdgeFeasible(I->getParent(), Dest) && + "Didn't find feasible edge?"); + (void)Dest; - // Remove entries in successor phi nodes to remove edges. - for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i) - TI->getSuccessor(i)->removePredecessor(TI->getParent()); - - // Remove the old terminator. - TI->eraseFromParent(); + I->setOperand(0, C); + Folded = ConstantFoldTerminator(I->getParent()); } + assert(Folded && + "Expect TermInst on constantint or blockaddress to be folded"); + (void) Folded; } // Finally, delete the basic block. @@ -2058,7 +2004,8 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, GlobalVariable *GV = I->first; assert(!I->second.isOverdefined() && "Overdefined values should have been taken out of the map!"); - DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n"); + LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName() + << "' is constant!\n"); while (!GV->use_empty()) { StoreInst *SI = cast<StoreInst>(GV->user_back()); SI->eraseFromParent(); @@ -2069,55 +2016,3 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, return MadeChanges; } - -PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) { - const DataLayout &DL = M.getDataLayout(); - auto &TLI = AM.getResult<TargetLibraryAnalysis>(M); - if (!runIPSCCP(M, DL, &TLI)) - return PreservedAnalyses::all(); - return PreservedAnalyses::none(); -} - -namespace { - -//===--------------------------------------------------------------------===// -// -/// IPSCCP Class - This class implements interprocedural Sparse Conditional -/// Constant Propagation. 
-/// -class IPSCCPLegacyPass : public ModulePass { -public: - static char ID; - - IPSCCPLegacyPass() : ModulePass(ID) { - initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry()); - } - - bool runOnModule(Module &M) override { - if (skipModule(M)) - return false; - const DataLayout &DL = M.getDataLayout(); - const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - return runIPSCCP(M, DL, TLI); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetLibraryInfoWrapperPass>(); - } -}; - -} // end anonymous namespace - -char IPSCCPLegacyPass::ID = 0; - -INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp", - "Interprocedural Sparse Conditional Constant Propagation", - false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp", - "Interprocedural Sparse Conditional Constant Propagation", - false, false) - -// createIPSCCPPass - This is the public interface to this file. -ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index bfe3754f0769..6c3f012c6280 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -42,6 +42,8 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantFolder.h" @@ -79,7 +81,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include <algorithm> #include <cassert> @@ -124,14 +125,9 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false), cl::Hidden); -/// Hidden option to allow more aggressive splitting. -static cl::opt<bool> -SROASplitNonWholeAllocaSlices("sroa-split-nonwhole-alloca-slices", - cl::init(false), cl::Hidden); - namespace { -/// \brief A custom IRBuilder inserter which prefixes all names, but only in +/// A custom IRBuilder inserter which prefixes all names, but only in /// Assert builds. class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter { std::string Prefix; @@ -151,23 +147,23 @@ protected: } }; -/// \brief Provide a type for IRBuilder that drops names in release builds. +/// Provide a type for IRBuilder that drops names in release builds. using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>; -/// \brief A used slice of an alloca. +/// A used slice of an alloca. /// /// This structure represents a slice of an alloca used by some instruction. It /// stores both the begin and end offsets of this use, a pointer to the use /// itself, and a flag indicating whether we can classify the use as splittable /// or not when forming partitions of the alloca. class Slice { - /// \brief The beginning offset of the range. + /// The beginning offset of the range. uint64_t BeginOffset = 0; - /// \brief The ending offset, not included in the range. + /// The ending offset, not included in the range. 
uint64_t EndOffset = 0; - /// \brief Storage for both the use of this slice and whether it can be + /// Storage for both the use of this slice and whether it can be /// split. PointerIntPair<Use *, 1, bool> UseAndIsSplittable; @@ -189,7 +185,7 @@ public: bool isDead() const { return getUse() == nullptr; } void kill() { UseAndIsSplittable.setPointer(nullptr); } - /// \brief Support for ordering ranges. + /// Support for ordering ranges. /// /// This provides an ordering over ranges such that start offsets are /// always increasing, and within equal start offsets, the end offsets are @@ -207,7 +203,7 @@ public: return false; } - /// \brief Support comparison with a single offset to allow binary searches. + /// Support comparison with a single offset to allow binary searches. friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS, uint64_t RHSOffset) { return LHS.beginOffset() < RHSOffset; @@ -233,7 +229,7 @@ template <> struct isPodLike<Slice> { static const bool value = true; }; } // end namespace llvm -/// \brief Representation of the alloca slices. +/// Representation of the alloca slices. /// /// This class represents the slices of an alloca which are formed by its /// various uses. If a pointer escapes, we can't fully build a representation @@ -242,16 +238,16 @@ template <> struct isPodLike<Slice> { static const bool value = true; }; /// starting at a particular offset before splittable slices. class llvm::sroa::AllocaSlices { public: - /// \brief Construct the slices of a particular alloca. + /// Construct the slices of a particular alloca. AllocaSlices(const DataLayout &DL, AllocaInst &AI); - /// \brief Test whether a pointer to the allocation escapes our analysis. + /// Test whether a pointer to the allocation escapes our analysis. /// /// If this is true, the slices are never fully built and should be /// ignored. bool isEscaped() const { return PointerEscapingInstr; } - /// \brief Support for iterating over the slices. + /// Support for iterating over the slices. /// @{ using iterator = SmallVectorImpl<Slice>::iterator; using range = iterator_range<iterator>; @@ -266,10 +262,10 @@ public: const_iterator end() const { return Slices.end(); } /// @} - /// \brief Erase a range of slices. + /// Erase a range of slices. void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); } - /// \brief Insert new slices for this alloca. + /// Insert new slices for this alloca. /// /// This moves the slices into the alloca's slices collection, and re-sorts /// everything so that the usual ordering properties of the alloca's slices @@ -278,7 +274,7 @@ public: int OldSize = Slices.size(); Slices.append(NewSlices.begin(), NewSlices.end()); auto SliceI = Slices.begin() + OldSize; - std::sort(SliceI, Slices.end()); + llvm::sort(SliceI, Slices.end()); std::inplace_merge(Slices.begin(), SliceI, Slices.end()); } @@ -287,10 +283,10 @@ public: class partition_iterator; iterator_range<partition_iterator> partitions(); - /// \brief Access the dead users for this alloca. + /// Access the dead users for this alloca. ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; } - /// \brief Access the dead operands referring to this alloca. + /// Access the dead operands referring to this alloca. 
/// /// These are operands which have cannot actually be used to refer to the /// alloca as they are outside its range and the user doesn't correct for @@ -316,11 +312,11 @@ private: friend class AllocaSlices::SliceBuilder; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// \brief Handle to alloca instruction to simplify method interfaces. + /// Handle to alloca instruction to simplify method interfaces. AllocaInst &AI; #endif - /// \brief The instruction responsible for this alloca not having a known set + /// The instruction responsible for this alloca not having a known set /// of slices. /// /// When an instruction (potentially) escapes the pointer to the alloca, we @@ -328,7 +324,7 @@ private: /// alloca. This will be null if the alloca slices are analyzed successfully. Instruction *PointerEscapingInstr; - /// \brief The slices of the alloca. + /// The slices of the alloca. /// /// We store a vector of the slices formed by uses of the alloca here. This /// vector is sorted by increasing begin offset, and then the unsplittable @@ -336,7 +332,7 @@ private: /// details. SmallVector<Slice, 8> Slices; - /// \brief Instructions which will become dead if we rewrite the alloca. + /// Instructions which will become dead if we rewrite the alloca. /// /// Note that these are not separated by slice. This is because we expect an /// alloca to be completely rewritten or not rewritten at all. If rewritten, @@ -344,7 +340,7 @@ private: /// they come from outside of the allocated space. SmallVector<Instruction *, 8> DeadUsers; - /// \brief Operands which will become dead if we rewrite the alloca. + /// Operands which will become dead if we rewrite the alloca. /// /// These are operands that in their particular use can be replaced with /// undef when we rewrite the alloca. These show up in out-of-bounds inputs @@ -355,7 +351,7 @@ private: SmallVector<Use *, 8> DeadOperands; }; -/// \brief A partition of the slices. +/// A partition of the slices. /// /// An ephemeral representation for a range of slices which can be viewed as /// a partition of the alloca. This range represents a span of the alloca's @@ -371,32 +367,32 @@ private: using iterator = AllocaSlices::iterator; - /// \brief The beginning and ending offsets of the alloca for this + /// The beginning and ending offsets of the alloca for this /// partition. uint64_t BeginOffset, EndOffset; - /// \brief The start and end iterators of this partition. + /// The start and end iterators of this partition. iterator SI, SJ; - /// \brief A collection of split slice tails overlapping the partition. + /// A collection of split slice tails overlapping the partition. SmallVector<Slice *, 4> SplitTails; - /// \brief Raw constructor builds an empty partition starting and ending at + /// Raw constructor builds an empty partition starting and ending at /// the given iterator. Partition(iterator SI) : SI(SI), SJ(SI) {} public: - /// \brief The start offset of this partition. + /// The start offset of this partition. /// /// All of the contained slices start at or after this offset. uint64_t beginOffset() const { return BeginOffset; } - /// \brief The end offset of this partition. + /// The end offset of this partition. /// /// All of the contained slices end at or before this offset. uint64_t endOffset() const { return EndOffset; } - /// \brief The size of the partition. + /// The size of the partition. /// /// Note that this can never be zero. 
uint64_t size() const { @@ -404,7 +400,7 @@ public: return EndOffset - BeginOffset; } - /// \brief Test whether this partition contains no slices, and merely spans + /// Test whether this partition contains no slices, and merely spans /// a region occupied by split slices. bool empty() const { return SI == SJ; } @@ -421,7 +417,7 @@ public: iterator end() const { return SJ; } /// @} - /// \brief Get the sequence of split slice tails. + /// Get the sequence of split slice tails. /// /// These tails are of slices which start before this partition but are /// split and overlap into the partition. We accumulate these while forming @@ -429,7 +425,7 @@ public: ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } }; -/// \brief An iterator over partitions of the alloca's slices. +/// An iterator over partitions of the alloca's slices. /// /// This iterator implements the core algorithm for partitioning the alloca's /// slices. It is a forward iterator as we don't support backtracking for @@ -443,18 +439,18 @@ class AllocaSlices::partition_iterator Partition> { friend class AllocaSlices; - /// \brief Most of the state for walking the partitions is held in a class + /// Most of the state for walking the partitions is held in a class /// with a nice interface for examining them. Partition P; - /// \brief We need to keep the end of the slices to know when to stop. + /// We need to keep the end of the slices to know when to stop. AllocaSlices::iterator SE; - /// \brief We also need to keep track of the maximum split end offset seen. + /// We also need to keep track of the maximum split end offset seen. /// FIXME: Do we really? uint64_t MaxSplitSliceEndOffset = 0; - /// \brief Sets the partition to be empty at given iterator, and sets the + /// Sets the partition to be empty at given iterator, and sets the /// end iterator. partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) : P(SI), SE(SE) { @@ -464,7 +460,7 @@ class AllocaSlices::partition_iterator advance(); } - /// \brief Advance the iterator to the next partition. + /// Advance the iterator to the next partition. /// /// Requires that the iterator not be at the end of the slices. void advance() { @@ -619,7 +615,7 @@ public: Partition &operator*() { return P; } }; -/// \brief A forward range over the partitions of the alloca's slices. +/// A forward range over the partitions of the alloca's slices. /// /// This accesses an iterator range over the partitions of the alloca's /// slices. It computes these partitions on the fly based on the overlapping @@ -643,7 +639,7 @@ static Value *foldSelectInst(SelectInst &SI) { return nullptr; } -/// \brief A helper that folds a PHI node or a select. +/// A helper that folds a PHI node or a select. static Value *foldPHINodeOrSelectInst(Instruction &I) { if (PHINode *PN = dyn_cast<PHINode>(&I)) { // If PN merges together the same value, return that value. @@ -652,7 +648,7 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) { return foldSelectInst(cast<SelectInst>(I)); } -/// \brief Builder for the alloca slices. +/// Builder for the alloca slices. /// /// This class builds a set of alloca slices by recursively visiting the uses /// of an alloca and making a slice for each load and store at each offset. 
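The slice ordering that AllocaSlices and the partition iterator above depend on sorts by ascending begin offset and, at equal begins, places unsplittable slices and larger extents first, so a forward scan always meets a spanning slice before the slices it encloses. A minimal model of that comparator (ToySlice is invented; the real Slice also carries its Use and splittability in a PointerIntPair):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Invented miniature of SROA's Slice, keeping only what the ordering
    // described above needs.
    struct ToySlice {
      uint64_t Begin = 0, End = 0;
      bool Splittable = false;

      bool operator<(const ToySlice &RHS) const {
        if (Begin != RHS.Begin)
          return Begin < RHS.Begin;   // start offsets always increasing
        if (Splittable != RHS.Splittable)
          return !Splittable;         // unsplittable slices first
        return End > RHS.End;         // then decreasing end offsets
      }
    };

    // After sorting, a slice spanning [0,16) precedes the slices it
    // encloses, e.g. [0,8) and [8,16).
    static void sortSlices(std::vector<ToySlice> &Slices) {
      std::sort(Slices.begin(), Slices.end());
    }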
@@ -668,7 +664,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap; SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes; - /// \brief Set to de-duplicate dead instructions found in the use walk. + /// Set to de-duplicate dead instructions found in the use walk. SmallPtrSet<Instruction *, 4> VisitedDeadInsts; public: @@ -687,11 +683,12 @@ private: // Completely skip uses which have a zero size or start either before or // past the end of the allocation. if (Size == 0 || Offset.uge(AllocSize)) { - DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset - << " which has zero size or starts outside of the " - << AllocSize << " byte alloca:\n" - << " alloca: " << AS.AI << "\n" - << " use: " << I << "\n"); + LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" + << Offset + << " which has zero size or starts outside of the " + << AllocSize << " byte alloca:\n" + << " alloca: " << AS.AI << "\n" + << " use: " << I << "\n"); return markAsDead(I); } @@ -706,10 +703,11 @@ private: // them, and so have to record at least the information here. assert(AllocSize >= BeginOffset); // Established above. if (Size > AllocSize - BeginOffset) { - DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset - << " to remain within the " << AllocSize << " byte alloca:\n" - << " alloca: " << AS.AI << "\n" - << " use: " << I << "\n"); + LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" + << Offset << " to remain within the " << AllocSize + << " byte alloca:\n" + << " alloca: " << AS.AI << "\n" + << " use: " << I << "\n"); EndOffset = AllocSize; } @@ -802,18 +800,18 @@ private: uint64_t Size = DL.getTypeStoreSize(ValOp->getType()); // If this memory access can be shown to *statically* extend outside the - // bounds of of the allocation, it's behavior is undefined, so simply + // bounds of the allocation, it's behavior is undefined, so simply // ignore it. Note that this is more strict than the generic clamping // behavior of insertUse. We also try to handle cases which might run the // risk of overflow. // FIXME: We should instead consider the pointer to have escaped if this // function is being instrumented for addressing bugs or race conditions. if (Size > AllocSize || Offset.ugt(AllocSize - Size)) { - DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset - << " which extends past the end of the " << AllocSize - << " byte alloca:\n" - << " alloca: " << AS.AI << "\n" - << " use: " << SI << "\n"); + LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" + << Offset << " which extends past the end of the " + << AllocSize << " byte alloca:\n" + << " alloca: " << AS.AI << "\n" + << " use: " << SI << "\n"); return markAsDead(SI); } @@ -1027,7 +1025,7 @@ private: void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); } - /// \brief Disable SROA entirely if there are unhandled users of the alloca. + /// Disable SROA entirely if there are unhandled users of the alloca. void visitInstruction(Instruction &I) { PI.setAborted(&I); } }; @@ -1062,7 +1060,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) // Sort the uses. This arranges for the offsets to be in ascending order, // and the sizes to be in descending order. 
- std::sort(Slices.begin(), Slices.end()); + llvm::sort(Slices.begin(), Slices.end()); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1240,7 +1238,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) { } static void speculatePHINodeLoads(PHINode &PN) { - DEBUG(dbgs() << " original: " << PN << "\n"); + LLVM_DEBUG(dbgs() << " original: " << PN << "\n"); Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); IRBuilderTy PHIBuilder(&PN); @@ -1263,10 +1261,21 @@ static void speculatePHINodeLoads(PHINode &PN) { } // Inject loads into all of the pred blocks. + DenseMap<BasicBlock*, Value*> InjectedLoads; for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { BasicBlock *Pred = PN.getIncomingBlock(Idx); - TerminatorInst *TI = Pred->getTerminator(); Value *InVal = PN.getIncomingValue(Idx); + + // A PHI node is allowed to have multiple (duplicated) entries for the same + // basic block, as long as the value is the same. So if we already injected + // a load in the predecessor, then we should reuse the same load for all + // duplicated entries. + if (Value* V = InjectedLoads.lookup(Pred)) { + NewPN->addIncoming(V, Pred); + continue; + } + + TerminatorInst *TI = Pred->getTerminator(); IRBuilderTy PredBuilder(TI); LoadInst *Load = PredBuilder.CreateLoad( @@ -1276,9 +1285,10 @@ static void speculatePHINodeLoads(PHINode &PN) { if (AATags) Load->setAAMetadata(AATags); NewPN->addIncoming(Load, Pred); + InjectedLoads[Pred] = Load; } - DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); + LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); PN.eraseFromParent(); } @@ -1318,7 +1328,7 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) { } static void speculateSelectInstLoads(SelectInst &SI) { - DEBUG(dbgs() << " original: " << SI << "\n"); + LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); IRBuilderTy IRB(&SI); Value *TV = SI.getTrueValue(); @@ -1349,14 +1359,14 @@ static void speculateSelectInstLoads(SelectInst &SI) { Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, LI->getName() + ".sroa.speculated"); - DEBUG(dbgs() << " speculated to: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n"); LI->replaceAllUsesWith(V); LI->eraseFromParent(); } SI.eraseFromParent(); } -/// \brief Build a GEP out of a base pointer and indices. +/// Build a GEP out of a base pointer and indices. /// /// This will return the BasePtr if that is valid, or build a new GEP /// instruction using the IRBuilder if GEP-ing is needed. @@ -1374,7 +1384,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, NamePrefix + "sroa_idx"); } -/// \brief Get a natural GEP off of the BasePtr walking through Ty toward +/// Get a natural GEP off of the BasePtr walking through Ty toward /// TargetTy without changing the offset of the pointer. /// /// This routine assumes we've already established a properly offset GEP with @@ -1423,7 +1433,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, return buildGEP(IRB, BasePtr, Indices, NamePrefix); } -/// \brief Recursively compute indices for a natural GEP. +/// Recursively compute indices for a natural GEP. /// /// This is the recursive step for getNaturalGEPWithOffset that walks down the /// element types adding appropriate indices for the GEP. 
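getNaturalGEPRecursively, documented above, walks down nested element types and emits one index per level so the resulting GEP reaches a byte offset without raw pointer arithmetic. For array-like levels the index at each step is simply the remaining offset divided by that level's element size; a toy version of the decomposition follows (struct levels, which need the DataLayout's field-offset table, are deliberately left out, and the function name is invented):

    #include <cstdint>
    #include <vector>

    // Toy "natural GEP" index computation for nested arrays: peel one
    // array dimension per level, dividing the remaining byte offset by
    // that level's element size. Returns an empty vector if the offset
    // does not land on an element boundary (the real code then falls
    // back to a raw byte-offset GEP).
    static std::vector<uint64_t>
    naturalIndices(uint64_t Offset, const std::vector<uint64_t> &ElemSizes) {
      std::vector<uint64_t> Indices{0}; // leading zero index, as buildGEP expects
      for (uint64_t Size : ElemSizes) {
        Indices.push_back(Offset / Size);
        Offset %= Size;
      }
      return Offset == 0 ? Indices : std::vector<uint64_t>{};
    }

    // Example: for [4 x [4 x i32]] the level sizes are {16, 4}; offset 20
    // yields indices {0, 1, 1}, i.e. the element at [1][1].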
@@ -1491,7 +1501,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, Indices, NamePrefix); } -/// \brief Get a natural GEP from a base pointer to a particular offset and +/// Get a natural GEP from a base pointer to a particular offset and /// resulting in a particular type. /// /// The goal is to produce a "natural" looking GEP that works with the existing @@ -1526,7 +1536,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, Indices, NamePrefix); } -/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the +/// Compute an adjusted pointer from Ptr by Offset bytes where the /// resulting pointer has PointerTy. /// /// This tries very hard to compute a "natural" GEP which arrives at the offset @@ -1635,7 +1645,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, return Ptr; } -/// \brief Compute the adjusted alignment for a load or store from an offset. +/// Compute the adjusted alignment for a load or store from an offset. static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, const DataLayout &DL) { unsigned Alignment; @@ -1656,7 +1666,7 @@ static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, return MinAlign(Alignment, Offset); } -/// \brief Test whether we can convert a value from the old to the new type. +/// Test whether we can convert a value from the old to the new type. /// /// This predicate should be used to guard calls to convertValue in order to /// ensure that we only try to convert viable values. The strategy is that we @@ -1707,7 +1717,7 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { return true; } -/// \brief Generic routine to convert an SSA value to a value of a different +/// Generic routine to convert an SSA value to a value of a different /// type. /// /// This will try various different casting techniques, such as bitcasts, @@ -1759,7 +1769,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, return IRB.CreateBitCast(V, NewTy); } -/// \brief Test whether the given slice use can be promoted to a vector. +/// Test whether the given slice use can be promoted to a vector. /// /// This function is called to test each entry in a partition which is slated /// for a single slice. @@ -1830,7 +1840,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, return true; } -/// \brief Test whether the given alloca partitioning and range of slices can be +/// Test whether the given alloca partitioning and range of slices can be /// promoted to a vector. /// /// This is a quick test to check whether we can rewrite a particular alloca @@ -1896,7 +1906,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { "All non-integer types eliminated!"); return RHSTy->getNumElements() < LHSTy->getNumElements(); }; - std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes); + llvm::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes); CandidateTys.erase( std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes), CandidateTys.end()); @@ -1943,7 +1953,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { return nullptr; } -/// \brief Test whether a slice of an alloca is valid for integer widening. +/// Test whether a slice of an alloca is valid for integer widening. 
/// /// This implements the necessary checking for the \c isIntegerWideningViable /// test below on a single slice of the alloca. @@ -1970,6 +1980,10 @@ static bool isIntegerWideningViableForSlice(const Slice &S, // We can't handle loads that extend past the allocated memory. if (DL.getTypeStoreSize(LI->getType()) > Size) return false; + // So far, AllocaSliceRewriter does not support widening split slice tails + // in rewriteIntegerLoad. + if (S.beginOffset() < AllocBeginOffset) + return false; // Note that we don't count vector loads or stores as whole-alloca // operations which enable integer widening because we would prefer to use // vector widening instead. @@ -1991,6 +2005,10 @@ static bool isIntegerWideningViableForSlice(const Slice &S, // We can't handle stores that extend past the allocated memory. if (DL.getTypeStoreSize(ValueTy) > Size) return false; + // So far, AllocaSliceRewriter does not support widening split slice tails + // in rewriteIntegerStore. + if (S.beginOffset() < AllocBeginOffset) + return false; // Note that we don't count vector loads or stores as whole-alloca // operations which enable integer widening because we would prefer to use // vector widening instead. @@ -2021,7 +2039,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, return true; } -/// \brief Test whether the given alloca partition's integer operations can be +/// Test whether the given alloca partition's integer operations can be /// widened to promotable ones. /// /// This is a quick test to check whether we can rewrite the integer loads and @@ -2072,7 +2090,7 @@ static bool isIntegerWideningViable(Partition &P, Type *AllocaTy, static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, IntegerType *Ty, uint64_t Offset, const Twine &Name) { - DEBUG(dbgs() << " start: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " start: " << *V << "\n"); IntegerType *IntTy = cast<IntegerType>(V->getType()); assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && "Element extends past full value"); @@ -2081,13 +2099,13 @@ static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); if (ShAmt) { V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); - DEBUG(dbgs() << " shifted: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n"); } assert(Ty->getBitWidth() <= IntTy->getBitWidth() && "Cannot extract to a larger integer!"); if (Ty != IntTy) { V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); - DEBUG(dbgs() << " trunced: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n"); } return V; } @@ -2098,10 +2116,10 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, IntegerType *Ty = cast<IntegerType>(V->getType()); assert(Ty->getBitWidth() <= IntTy->getBitWidth() && "Cannot insert a larger integer!"); - DEBUG(dbgs() << " start: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " start: " << *V << "\n"); if (Ty != IntTy) { V = IRB.CreateZExt(V, IntTy, Name + ".ext"); - DEBUG(dbgs() << " extended: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " extended: " << *V << "\n"); } assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && "Element store outside of alloca store"); @@ -2110,15 +2128,15 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); if (ShAmt) { V = IRB.CreateShl(V, ShAmt, Name + ".shift"); - DEBUG(dbgs() << " shifted: " 
<< *V << "\n"); + LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n"); } if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); - DEBUG(dbgs() << " masked: " << *Old << "\n"); + LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n"); V = IRB.CreateOr(Old, V, Name + ".insert"); - DEBUG(dbgs() << " inserted: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n"); } return V; } @@ -2135,7 +2153,7 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, if (NumElements == 1) { V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), Name + ".extract"); - DEBUG(dbgs() << " extract: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " extract: " << *V << "\n"); return V; } @@ -2145,7 +2163,7 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, Mask.push_back(IRB.getInt32(i)); V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), ConstantVector::get(Mask), Name + ".extract"); - DEBUG(dbgs() << " shuffle: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n"); return V; } @@ -2159,7 +2177,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, // Single element to insert. V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex), Name + ".insert"); - DEBUG(dbgs() << " insert: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " insert: " << *V << "\n"); return V; } @@ -2184,7 +2202,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, Mask.push_back(UndefValue::get(IRB.getInt32Ty())); V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), ConstantVector::get(Mask), Name + ".expand"); - DEBUG(dbgs() << " shuffle: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n"); Mask.clear(); for (unsigned i = 0; i != VecTy->getNumElements(); ++i) @@ -2192,11 +2210,11 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend"); - DEBUG(dbgs() << " blend: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " blend: " << *V << "\n"); return V; } -/// \brief Visitor to rewrite instructions using p particular slice of an alloca +/// Visitor to rewrite instructions using p particular slice of an alloca /// to use a new alloca. /// /// Also implements the rewriting to vector-based accesses when the partition @@ -2295,9 +2313,9 @@ public: IsSplittable = I->isSplittable(); IsSplit = BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; - DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : "")); - DEBUG(AS.printSlice(dbgs(), I, "")); - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : "")); + LLVM_DEBUG(AS.printSlice(dbgs(), I, "")); + LLVM_DEBUG(dbgs() << "\n"); // Compute the intersecting offset range. assert(BeginOffset < NewAllocaEndOffset); @@ -2327,7 +2345,7 @@ private: // Every instruction which can end up as a user must have a rewrite rule. bool visitInstruction(Instruction &I) { - DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); + LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); llvm_unreachable("No rewrite rule for this instruction!"); } @@ -2369,7 +2387,7 @@ private: ); } - /// \brief Compute suitable alignment to access this slice of the *new* + /// Compute suitable alignment to access this slice of the *new* /// alloca. 
/// /// You can optionally pass a type to this routine and if that type's ABI @@ -2431,10 +2449,13 @@ private: } bool visitLoadInst(LoadInst &LI) { - DEBUG(dbgs() << " original: " << LI << "\n"); + LLVM_DEBUG(dbgs() << " original: " << LI << "\n"); Value *OldOp = LI.getOperand(0); assert(OldOp == OldPtr); + AAMDNodes AATags; + LI.getAAMetadata(AATags); + unsigned AS = LI.getPointerAddressSpace(); Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8) @@ -2453,6 +2474,8 @@ private: TargetTy->isIntegerTy()))) { LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), LI.getName()); + if (AATags) + NewLI->setAAMetadata(AATags); if (LI.isVolatile()) NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); @@ -2488,6 +2511,8 @@ private: LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), getSliceAlign(TargetTy), LI.isVolatile(), LI.getName()); + if (AATags) + NewLI->setAAMetadata(AATags); if (LI.isVolatile()) NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); @@ -2524,11 +2549,12 @@ private: Pass.DeadInsts.insert(&LI); deleteIfTriviallyDead(OldOp); - DEBUG(dbgs() << " to: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " to: " << *V << "\n"); return !LI.isVolatile() && !IsPtrAdjusted; } - bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) { + bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp, + AAMDNodes AATags) { if (V->getType() != VecTy) { unsigned BeginIndex = getIndex(NewBeginOffset); unsigned EndIndex = getIndex(NewEndOffset); @@ -2546,14 +2572,15 @@ private: V = insertVector(IRB, Old, V, BeginIndex, "vec"); } StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); + if (AATags) + Store->setAAMetadata(AATags); Pass.DeadInsts.insert(&SI); - (void)Store; - DEBUG(dbgs() << " to: " << *Store << "\n"); + LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); return true; } - bool rewriteIntegerStore(Value *V, StoreInst &SI) { + bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) { assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { @@ -2567,16 +2594,21 @@ private: V = convertValue(DL, IRB, V, NewAllocaTy); StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Store->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access); + if (AATags) + Store->setAAMetadata(AATags); Pass.DeadInsts.insert(&SI); - DEBUG(dbgs() << " to: " << *Store << "\n"); + LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); return true; } bool visitStoreInst(StoreInst &SI) { - DEBUG(dbgs() << " original: " << SI << "\n"); + LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); Value *OldOp = SI.getOperand(1); assert(OldOp == OldPtr); + AAMDNodes AATags; + SI.getAAMetadata(AATags); + Value *V = SI.getValueOperand(); // Strip all inbounds GEPs and pointer casts to try to dig out any root @@ -2598,9 +2630,9 @@ private: } if (VecTy) - return rewriteVectorizedStoreInst(V, SI, OldOp); + return rewriteVectorizedStoreInst(V, SI, OldOp, AATags); if (IntTy && V->getType()->isIntegerTy()) - return rewriteIntegerStore(V, SI); + return rewriteIntegerStore(V, SI, AATags); const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize; StoreInst *NewSI; @@ -2631,16 +2663,18 @@ private: SI.isVolatile()); } NewSI->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access); + if (AATags) + NewSI->setAAMetadata(AATags); if (SI.isVolatile()) NewSI->setAtomic(SI.getOrdering(), 
SI.getSyncScopeID()); Pass.DeadInsts.insert(&SI); deleteIfTriviallyDead(OldOp); - DEBUG(dbgs() << " to: " << *NewSI << "\n"); + LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n"); return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile(); } - /// \brief Compute an integer value from splatting an i8 across the given + /// Compute an integer value from splatting an i8 across the given /// number of bytes. /// /// Note that this routine assumes an i8 is a byte. If that isn't true, don't @@ -2667,25 +2701,27 @@ private: return V; } - /// \brief Compute a vector splat for a given element value. + /// Compute a vector splat for a given element value. Value *getVectorSplat(Value *V, unsigned NumElements) { V = IRB.CreateVectorSplat(NumElements, V, "vsplat"); - DEBUG(dbgs() << " splat: " << *V << "\n"); + LLVM_DEBUG(dbgs() << " splat: " << *V << "\n"); return V; } bool visitMemSetInst(MemSetInst &II) { - DEBUG(dbgs() << " original: " << II << "\n"); + LLVM_DEBUG(dbgs() << " original: " << II << "\n"); assert(II.getRawDest() == OldPtr); + AAMDNodes AATags; + II.getAAMetadata(AATags); + // If the memset has a variable size, it cannot be split, just adjust the // pointer to the new alloca. if (!isa<Constant>(II.getLength())) { assert(!IsSplit); assert(NewBeginOffset == BeginOffset); II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType())); - Type *CstTy = II.getAlignmentCst()->getType(); - II.setAlignment(ConstantInt::get(CstTy, getSliceAlign())); + II.setDestAlignment(getSliceAlign()); deleteIfTriviallyDead(OldPtr); return false; @@ -2710,8 +2746,9 @@ private: CallInst *New = IRB.CreateMemSet( getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, getSliceAlign(), II.isVolatile()); - (void)New; - DEBUG(dbgs() << " to: " << *New << "\n"); + if (AATags) + New->setAAMetadata(AATags); + LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); return false; } @@ -2773,10 +2810,11 @@ private: V = convertValue(DL, IRB, V, AllocaTy); } - Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), - II.isVolatile()); - (void)New; - DEBUG(dbgs() << " to: " << *New << "\n"); + StoreInst *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), + II.isVolatile()); + if (AATags) + New->setAAMetadata(AATags); + LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); return !II.isVolatile(); } @@ -2784,7 +2822,10 @@ private: // Rewriting of memory transfer instructions can be a bit tricky. We break // them into two categories: split intrinsics and unsplit intrinsics. - DEBUG(dbgs() << " original: " << II << "\n"); + LLVM_DEBUG(dbgs() << " original: " << II << "\n"); + + AAMDNodes AATags; + II.getAAMetadata(AATags); bool IsDest = &II.getRawDestUse() == OldUse; assert((IsDest && II.getRawDest() == OldPtr) || @@ -2801,18 +2842,16 @@ private: // update both source and dest of a single call. if (!IsSplittable) { Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); - if (IsDest) + if (IsDest) { II.setDest(AdjustedPtr); - else + II.setDestAlignment(SliceAlign); + } + else { II.setSource(AdjustedPtr); - - if (II.getAlignment() > SliceAlign) { - Type *CstTy = II.getAlignmentCst()->getType(); - II.setAlignment( - ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign))); + II.setSourceAlignment(SliceAlign); } - DEBUG(dbgs() << " to: " << II << "\n"); + LLVM_DEBUG(dbgs() << " to: " << II << "\n"); deleteIfTriviallyDead(OldPtr); return false; } @@ -2862,8 +2901,10 @@ private: // Compute the relative offset for the other pointer within the transfer. 
unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS); APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset); - unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1, - OtherOffset.zextOrTrunc(64).getZExtValue()); + unsigned OtherAlign = + IsDest ? II.getSourceAlignment() : II.getDestAlignment(); + OtherAlign = MinAlign(OtherAlign ? OtherAlign : 1, + OtherOffset.zextOrTrunc(64).getZExtValue()); if (EmitMemCpy) { // Compute the other pointer, folding as much as possible to produce @@ -2875,11 +2916,25 @@ private: Type *SizeTy = II.getLength()->getType(); Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); - CallInst *New = IRB.CreateMemCpy( - IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size, - MinAlign(SliceAlign, OtherAlign), II.isVolatile()); - (void)New; - DEBUG(dbgs() << " to: " << *New << "\n"); + Value *DestPtr, *SrcPtr; + unsigned DestAlign, SrcAlign; + // Note: IsDest is true iff we're copying into the new alloca slice + if (IsDest) { + DestPtr = OurPtr; + DestAlign = SliceAlign; + SrcPtr = OtherPtr; + SrcAlign = OtherAlign; + } else { + DestPtr = OtherPtr; + DestAlign = OtherAlign; + SrcPtr = OurPtr; + SrcAlign = SliceAlign; + } + CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign, + Size, II.isVolatile()); + if (AATags) + New->setAAMetadata(AATags); + LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); return false; } @@ -2927,8 +2982,11 @@ private: uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); } else { - Src = - IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload"); + LoadInst *Load = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), + "copyload"); + if (AATags) + Load->setAAMetadata(AATags); + Src = Load; } if (VecTy && !IsWholeAlloca && IsDest) { @@ -2946,15 +3004,16 @@ private: StoreInst *Store = cast<StoreInst>( IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile())); - (void)Store; - DEBUG(dbgs() << " to: " << *Store << "\n"); + if (AATags) + Store->setAAMetadata(AATags); + LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); return !II.isVolatile(); } bool visitIntrinsicInst(IntrinsicInst &II) { assert(II.getIntrinsicID() == Intrinsic::lifetime_start || II.getIntrinsicID() == Intrinsic::lifetime_end); - DEBUG(dbgs() << " original: " << II << "\n"); + LLVM_DEBUG(dbgs() << " original: " << II << "\n"); assert(II.getArgOperand(1) == OldPtr); // Record this instruction for deletion. @@ -2982,13 +3041,13 @@ private: New = IRB.CreateLifetimeEnd(Ptr, Size); (void)New; - DEBUG(dbgs() << " to: " << *New << "\n"); + LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); return true; } bool visitPHINode(PHINode &PN) { - DEBUG(dbgs() << " original: " << PN << "\n"); + LLVM_DEBUG(dbgs() << " original: " << PN << "\n"); assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable"); assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable"); @@ -3007,7 +3066,7 @@ private: // Replace the operands which were using the old pointer. std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); - DEBUG(dbgs() << " to: " << PN << "\n"); + LLVM_DEBUG(dbgs() << " to: " << PN << "\n"); deleteIfTriviallyDead(OldPtr); // PHIs can't be promoted on their own, but often can be speculated. 
We @@ -3018,7 +3077,7 @@ private: } bool visitSelectInst(SelectInst &SI) { - DEBUG(dbgs() << " original: " << SI << "\n"); + LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) && "Pointer isn't an operand!"); assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable"); @@ -3031,7 +3090,7 @@ private: if (SI.getOperand(2) == OldPtr) SI.setOperand(2, NewPtr); - DEBUG(dbgs() << " to: " << SI << "\n"); + LLVM_DEBUG(dbgs() << " to: " << SI << "\n"); deleteIfTriviallyDead(OldPtr); // Selects can't be promoted on their own, but often can be speculated. We @@ -3044,7 +3103,7 @@ private: namespace { -/// \brief Visitor to rewrite aggregate loads and stores as scalar. +/// Visitor to rewrite aggregate loads and stores as scalar. /// /// This pass aggressively rewrites all aggregate loads and stores on /// a particular pointer (or any pointer derived from it which we can identify) @@ -3067,7 +3126,7 @@ public: /// Rewrite loads and stores through a pointer and all pointers derived from /// it. bool rewrite(Instruction &I) { - DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); + LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); enqueueUsers(I); bool Changed = false; while (!Queue.empty()) { @@ -3089,7 +3148,7 @@ private: // Conservative default is to not rewrite anything. bool visitInstruction(Instruction &I) { return false; } - /// \brief Generic recursive split emission class. + /// Generic recursive split emission class. template <typename Derived> class OpSplitter { protected: /// The builder used to form new instructions. @@ -3113,7 +3172,7 @@ private: : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} public: - /// \brief Generic recursive split emission routine. + /// Generic recursive split emission routine. /// /// This method recursively splits an aggregate op (load or store) into /// scalar or vector ops. It splits recursively until it hits a single value @@ -3165,8 +3224,10 @@ private: }; struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { - LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) - : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} + AAMDNodes AATags; + + LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags) + : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr), AATags(AATags) {} /// Emit a leaf load of a single value. This is called at the leaves of the /// recursive emission to actually load values. @@ -3175,9 +3236,11 @@ private: // Load the single value and insert it using the indices. Value *GEP = IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"); - Value *Load = IRB.CreateLoad(GEP, Name + ".load"); + LoadInst *Load = IRB.CreateLoad(GEP, Name + ".load"); + if (AATags) + Load->setAAMetadata(AATags); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); - DEBUG(dbgs() << " to: " << *Load << "\n"); + LLVM_DEBUG(dbgs() << " to: " << *Load << "\n"); } }; @@ -3187,8 +3250,10 @@ private: return false; // We have an aggregate being loaded, split it apart. 
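// [Editor's sketch -- not part of this patch.] What the splitter below emits:
// for a load of a first-class aggregate such as {i32, float}, LoadOpSplitter
// produces one scalar load per leaf plus insertvalues that rebuild the
// aggregate, roughly (IRB is an IRBuilder<>; Ptr, STy and the names are
// illustrative):
//
//   Value *Agg = UndefValue::get(STy);  // STy = the {i32, float} struct type
//   for (unsigned i = 0, e = STy->getStructNumElements(); i != e; ++i) {
//     Value *GEP  = IRB.CreateStructGEP(nullptr, Ptr, i, "leaf.gep");
//     Value *Leaf = IRB.CreateLoad(GEP, "leaf.load");
//     Agg = IRB.CreateInsertValue(Agg, Leaf, i, "leaf.insert");
//   }
//
// StoreOpSplitter is the mirror image: an extractvalue per leaf, then a store.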
- DEBUG(dbgs() << " original: " << LI << "\n"); - LoadOpSplitter Splitter(&LI, *U); + LLVM_DEBUG(dbgs() << " original: " << LI << "\n"); + AAMDNodes AATags; + LI.getAAMetadata(AATags); + LoadOpSplitter Splitter(&LI, *U, AATags); Value *V = UndefValue::get(LI.getType()); Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); LI.replaceAllUsesWith(V); @@ -3197,8 +3262,9 @@ private: } struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { - StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) - : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} + StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, AAMDNodes AATags) + : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr), AATags(AATags) {} + AAMDNodes AATags; /// Emit a leaf store of a single value. This is called at the leaves of the /// recursive emission to actually produce stores. @@ -3212,9 +3278,10 @@ private: IRB.CreateExtractValue(Agg, Indices, Name + ".extract"); Value *InBoundsGEP = IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"); - Value *Store = IRB.CreateStore(ExtractValue, InBoundsGEP); - (void)Store; - DEBUG(dbgs() << " to: " << *Store << "\n"); + StoreInst *Store = IRB.CreateStore(ExtractValue, InBoundsGEP); + if (AATags) + Store->setAAMetadata(AATags); + LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); } }; @@ -3226,8 +3293,10 @@ private: return false; // We have an aggregate being stored, split it apart. - DEBUG(dbgs() << " original: " << SI << "\n"); - StoreOpSplitter Splitter(&SI, *U); + LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); + AAMDNodes AATags; + SI.getAAMetadata(AATags); + StoreOpSplitter Splitter(&SI, *U, AATags); Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); SI.eraseFromParent(); return true; @@ -3256,7 +3325,7 @@ private: } // end anonymous namespace -/// \brief Strip aggregate type wrapping. +/// Strip aggregate type wrapping. /// /// This removes no-op aggregate types wrapping an underlying type. It will /// strip as many layers of types as it can without changing either the type @@ -3286,7 +3355,7 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { return stripAggregateTypeWrapping(DL, InnerTy); } -/// \brief Try to find a partition of the aggregate type passed in for a given +/// Try to find a partition of the aggregate type passed in for a given /// offset and size. /// /// This recurses through the aggregate type and tries to compute a subtype @@ -3392,7 +3461,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, return SubTy; } -/// \brief Pre-split loads and stores to simplify rewriting. +/// Pre-split loads and stores to simplify rewriting. /// /// We want to break up the splittable load+store pairs as much as /// possible. This is important to do as a preprocessing step, as once we @@ -3423,7 +3492,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, /// /// \returns true if any changes are made. bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { - DEBUG(dbgs() << "Pre-splitting loads and stores\n"); + LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n"); // Track the loads and stores which are candidates for pre-splitting here, in // the order they first appear during the partition scan. These give stable @@ -3455,7 +3524,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // maybe it would make it more principled? 
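// [Editor's worked example -- not part of this patch.] Pre-splitting in
// miniature: if the slices record an i64 load whose two i32 halves feed two
// i32 stores at different destinations, the i64 load is replaced by two i32
// loads at offsets 0 and 4 wired to the matching stores. Afterwards every
// new slice lines up 1:1 with a partition, so the main rewriter never has to
// split a live load/store pair itself.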
SmallPtrSet<LoadInst *, 8> UnsplittableLoads; - DEBUG(dbgs() << " Searching for candidate loads and stores\n"); + LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n"); for (auto &P : AS.partitions()) { for (Slice &S : P) { Instruction *I = cast<Instruction>(S.getUse()->getUser()); @@ -3510,7 +3579,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { } // Record the initial split. - DEBUG(dbgs() << " Candidate: " << *I << "\n"); + LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n"); auto &Offsets = SplitOffsetsMap[I]; assert(Offsets.Splits.empty() && "Should not have splits the first time we see an instruction!"); @@ -3570,10 +3639,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { if (LoadOffsets.Splits == StoreOffsets.Splits) return false; - DEBUG(dbgs() - << " Mismatched splits for load and store:\n" - << " " << *LI << "\n" - << " " << *SI << "\n"); + LLVM_DEBUG( + dbgs() + << " Mismatched splits for load and store:\n" + << " " << *LI << "\n" + << " " << *SI << "\n"); // We've found a store and load that we need to split // with mismatched relative splits. Just give up on them @@ -3646,7 +3716,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand()); IRB.SetInsertPoint(LI); - DEBUG(dbgs() << " Splitting load: " << *LI << "\n"); + LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n"); uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); int Idx = 0, Size = Offsets.Splits.size(); @@ -3656,7 +3726,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { auto *PartPtrTy = PartTy->getPointerTo(AS); LoadInst *PLoad = IRB.CreateAlignedLoad( getAdjustedPtr(IRB, DL, BasePtr, - APInt(DL.getPointerSizeInBits(AS), PartOffset), + APInt(DL.getIndexSizeInBits(AS), PartOffset), PartPtrTy, BasePtr->getName() + "."), getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, LI->getName()); @@ -3671,9 +3741,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, &PLoad->getOperandUse(PLoad->getPointerOperandIndex()), /*IsSplittable*/ false)); - DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() - << ", " << NewSlices.back().endOffset() << "): " << *PLoad - << "\n"); + LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() + << ", " << NewSlices.back().endOffset() + << "): " << *PLoad << "\n"); // See if we've handled all the splits. 
if (Idx >= Size) @@ -3693,14 +3763,15 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { StoreInst *SI = cast<StoreInst>(LU); if (!Stores.empty() && SplitOffsetsMap.count(SI)) { DeferredStores = true; - DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n"); + LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI + << "\n"); continue; } Value *StoreBasePtr = SI->getPointerOperand(); IRB.SetInsertPoint(SI); - DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); + LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) { LoadInst *PLoad = SplitLoads[Idx]; @@ -3712,11 +3783,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { StoreInst *PStore = IRB.CreateAlignedStore( PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, - APInt(DL.getPointerSizeInBits(AS), PartOffset), + APInt(DL.getIndexSizeInBits(AS), PartOffset), PartPtrTy, StoreBasePtr->getName() + "."), getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); PStore->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access); - DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); + LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); } // We want to immediately iterate on any allocas impacted by splitting @@ -3765,7 +3836,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { Value *LoadBasePtr = LI->getPointerOperand(); Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand()); - DEBUG(dbgs() << " Splitting store: " << *SI << "\n"); + LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n"); // Check whether we have an already split load. auto SplitLoadsMapI = SplitLoadsMap.find(LI); @@ -3775,7 +3846,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { assert(SplitLoads->size() == Offsets.Splits.size() + 1 && "Too few split loads for the number of splits in the store!"); } else { - DEBUG(dbgs() << " of load: " << *LI << "\n"); + LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n"); } uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); @@ -3794,7 +3865,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { auto AS = LI->getPointerAddressSpace(); PLoad = IRB.CreateAlignedLoad( getAdjustedPtr(IRB, DL, LoadBasePtr, - APInt(DL.getPointerSizeInBits(AS), PartOffset), + APInt(DL.getIndexSizeInBits(AS), PartOffset), LoadPartPtrTy, LoadBasePtr->getName() + "."), getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, LI->getName()); @@ -3806,7 +3877,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { StoreInst *PStore = IRB.CreateAlignedStore( PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, - APInt(DL.getPointerSizeInBits(AS), PartOffset), + APInt(DL.getIndexSizeInBits(AS), PartOffset), StorePartPtrTy, StoreBasePtr->getName() + "."), getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); @@ -3815,11 +3886,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, &PStore->getOperandUse(PStore->getPointerOperandIndex()), /*IsSplittable*/ false)); - DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() - << ", " << NewSlices.back().endOffset() << "): " << *PStore - << "\n"); + LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() + << ", " << NewSlices.back().endOffset() + << "): " << *PStore << "\n"); if (!SplitLoads) { - DEBUG(dbgs() << " of split 
load: " << *PLoad << "\n"); + LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n"); } // See if we've finished all the splits. @@ -3874,10 +3945,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // sequence. AS.insert(NewSlices); - DEBUG(dbgs() << " Pre-split slices:\n"); + LLVM_DEBUG(dbgs() << " Pre-split slices:\n"); #ifndef NDEBUG for (auto I = AS.begin(), E = AS.end(); I != E; ++I) - DEBUG(AS.print(dbgs(), I, " ")); + LLVM_DEBUG(AS.print(dbgs(), I, " ")); #endif // Finally, don't try to promote any allocas that new require re-splitting. @@ -3891,7 +3962,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { return true; } -/// \brief Rewrite an alloca partition's users. +/// Rewrite an alloca partition's users. /// /// This routine drives both of the rewriting goals of the SROA pass. It tries /// to rewrite uses of an alloca partition to be conducive for SSA value @@ -3934,10 +4005,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // exact same type as the original, and with the same access offsets. In that // case, re-use the existing alloca, but still run through the rewriter to // perform phi and select speculation. + // P.beginOffset() can be non-zero even with the same type in a case with + // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll). AllocaInst *NewAI; - if (SliceTy == AI.getAllocatedType()) { - assert(P.beginOffset() == 0 && - "Non-zero begin offset but same alloca type"); + if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) { NewAI = &AI; // FIXME: We should be able to bail at this point with "nothing changed". // FIXME: We might want to defer PHI speculation until after here. @@ -3958,12 +4029,14 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, NewAI = new AllocaInst( SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment, AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI); + // Copy the old AI debug location over to the new one. + NewAI->setDebugLoc(AI.getDebugLoc()); ++NumNewAllocas; } - DEBUG(dbgs() << "Rewriting alloca partition " - << "[" << P.beginOffset() << "," << P.endOffset() - << ") to: " << *NewAI << "\n"); + LLVM_DEBUG(dbgs() << "Rewriting alloca partition " + << "[" << P.beginOffset() << "," << P.endOffset() + << ") to: " << *NewAI << "\n"); // Track the high watermark on the worklist as it is only relevant for // promoted allocas. We will reset it to this point if the alloca is not in @@ -4040,7 +4113,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, return NewAI; } -/// \brief Walks the slices of an alloca and form partitions based on them, +/// Walks the slices of an alloca and form partitions based on them, /// rewriting each of their uses. bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { if (AS.begin() == AS.end()) @@ -4063,7 +4136,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { uint64_t AllocaSize = DL.getTypeAllocSize(AI.getAllocatedType()); const uint64_t MaxBitVectorSize = 1024; - if (SROASplitNonWholeAllocaSlices && AllocaSize <= MaxBitVectorSize) { + if (AllocaSize <= MaxBitVectorSize) { // If a byte boundary is included in any load or store, a slice starting or // ending at the boundary is not splittable. 
SmallBitVector SplittableOffset(AllocaSize + 1, true); @@ -4106,7 +4179,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { } if (!IsSorted) - std::sort(AS.begin(), AS.end()); + llvm::sort(AS.begin(), AS.end()); /// Describes the allocas introduced by rewritePartition in order to migrate /// the debug info. @@ -4201,7 +4274,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { return Changed; } -/// \brief Clobber a use with undef, deleting the used value if it becomes dead. +/// Clobber a use with undef, deleting the used value if it becomes dead. void SROA::clobberUse(Use &U) { Value *OldV = U; // Replace the use with an undef value. @@ -4216,13 +4289,13 @@ void SROA::clobberUse(Use &U) { } } -/// \brief Analyze an alloca for SROA. +/// Analyze an alloca for SROA. /// /// This analyzes the alloca to ensure we can reason about it, builds /// the slices of the alloca, and then hands it off to be split and /// rewritten as needed. bool SROA::runOnAlloca(AllocaInst &AI) { - DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); + LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); ++NumAllocasAnalyzed; // Special case dead allocas, as they're trivial. @@ -4246,7 +4319,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) { // Build the slices using a recursive instruction-visiting builder. AllocaSlices AS(DL, AI); - DEBUG(AS.print(dbgs())); + LLVM_DEBUG(AS.print(dbgs())); if (AS.isEscaped()) return Changed; @@ -4274,18 +4347,18 @@ bool SROA::runOnAlloca(AllocaInst &AI) { Changed |= splitAlloca(AI, AS); - DEBUG(dbgs() << " Speculating PHIs\n"); + LLVM_DEBUG(dbgs() << " Speculating PHIs\n"); while (!SpeculatablePHIs.empty()) speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val()); - DEBUG(dbgs() << " Speculating Selects\n"); + LLVM_DEBUG(dbgs() << " Speculating Selects\n"); while (!SpeculatableSelects.empty()) speculateSelectInstLoads(*SpeculatableSelects.pop_back_val()); return Changed; } -/// \brief Delete the dead instructions accumulated in this run. +/// Delete the dead instructions accumulated in this run. /// /// Recursively deletes the dead instructions we've accumulated. This is done /// at the very end to maximize locality of the recursive delete and to @@ -4299,7 +4372,7 @@ bool SROA::deleteDeadInstructions( bool Changed = false; while (!DeadInsts.empty()) { Instruction *I = DeadInsts.pop_back_val(); - DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); + LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); // If the instruction is an alloca, find the possible dbg.declare connected // to it, and remove it too. We must do this before calling RAUW or we will @@ -4327,7 +4400,7 @@ bool SROA::deleteDeadInstructions( return Changed; } -/// \brief Promote the allocas, using the best available technique. +/// Promote the allocas, using the best available technique. /// /// This attempts to promote whatever allocas have been identified as viable in /// the PromotableAllocas list. If that list is empty, there is nothing to do. 
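// [Editor's note -- not part of this patch.] The std::sort -> llvm::sort
// change above is behavior-preserving in normal builds; when LLVM is built
// with expensive checks enabled, llvm::sort shuffles the range before
// sorting, so any hidden reliance on the unspecified relative order of
// equivalent slices surfaces as a test failure rather than as
// non-deterministic output.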
@@ -4338,7 +4411,7 @@ bool SROA::promoteAllocas(Function &F) { NumPromoted += PromotableAllocas.size(); - DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); + LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); PromoteMemToReg(PromotableAllocas, *DT, AC); PromotableAllocas.clear(); return true; @@ -4346,7 +4419,7 @@ bool SROA::promoteAllocas(Function &F) { PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, AssumptionCache &RunAC) { - DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); + LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); DT = &RunDT; AC = &RunAC; diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index 3b99ddff2e06..526487d3477e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -45,6 +45,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeScalarizerPass(Registry); initializeDSELegacyPassPass(Registry); initializeGuardWideningLegacyPassPass(Registry); + initializeLoopGuardWideningLegacyPassPass(Registry); initializeGVNLegacyPassPass(Registry); initializeNewGVNLegacyPassPass(Registry); initializeEarlyCSELegacyPassPass(Registry); @@ -52,9 +53,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeGVNHoistLegacyPassPass(Registry); initializeGVNSinkLegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); - initializeInductiveRangeCheckEliminationPass(Registry); + initializeIRCELegacyPassPass(Registry); initializeIndVarSimplifyLegacyPassPass(Registry); initializeInferAddressSpacesPass(Registry); + initializeInstSimplifyLegacyPassPass(Registry); initializeJumpThreadingPass(Registry); initializeLegacyLICMPassPass(Registry); initializeLegacyLoopSinkPassPass(Registry); @@ -68,6 +70,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopStrengthReducePass(Registry); initializeLoopRerollPass(Registry); initializeLoopUnrollPass(Registry); + initializeLoopUnrollAndJamPass(Registry); initializeLoopUnswitchPass(Registry); initializeLoopVersioningLICMPass(Registry); initializeLoopIdiomRecognizeLegacyPassPass(Registry); @@ -83,7 +86,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeRegToMemPass(Registry); initializeRewriteStatepointsForGCLegacyPassPass(Registry); initializeSCCPLegacyPassPass(Registry); - initializeIPSCCPLegacyPassPass(Registry); initializeSROALegacyPassPass(Registry); initializeCFGSimplifyPassPass(Registry); initializeStructurizeCFGPass(Registry); @@ -104,6 +106,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializePostInlineEntryExitInstrumenterPass(Registry); } +void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopSimplifyCFGPass()); +} + void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { initializeScalarOpts(*unwrap(R)); } @@ -148,10 +154,6 @@ void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createIndVarSimplifyPass()); } -void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createInstructionCombiningPass()); -} - void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createJumpThreadingPass()); } @@ -180,14 +182,14 @@ void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopRerollPass()); } -void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createLoopSimplifyCFGPass()); -} - void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) { 
unwrap(PM)->add(createLoopUnrollPass()); } +void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopUnrollAndJamPass()); +} + void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopUnswitchPass()); } @@ -200,14 +202,6 @@ void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createPartiallyInlineLibCallsPass()); } -void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createLowerSwitchPass()); -} - -void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createPromoteMemoryToRegisterPass()); -} - void LLVMAddReassociatePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createReassociatePass()); } diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 4a96e0ddca16..967f4a42a8fb 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -165,8 +165,8 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -190,7 +190,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> #include <string> @@ -213,7 +212,7 @@ static cl::opt<bool> namespace { -/// \brief A helper class for separating a constant offset from a GEP index. +/// A helper class for separating a constant offset from a GEP index. /// /// In real programs, a GEP index may be more complicated than a simple addition /// of something and a constant integer which can be trivially split. For @@ -340,16 +339,15 @@ private: const DominatorTree *DT; }; -/// \brief A pass that tries to split every GEP in the function into a variadic +/// A pass that tries to split every GEP in the function into a variadic /// base and a constant offset. It is a FunctionPass because searching for the /// constant offset may inspect other basic blocks.
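// [Editor's worked example -- not part of this patch.] The split performed by
// this pass in miniature: given
//
//   %idx = add i64 %i, 5
//   %p   = getelementptr inbounds float, float* %a, i64 %idx
//
// it extracts the constant summand and produces
//
//   %base = getelementptr float, float* %a, i64 %i
//   %p    = getelementptr float, float* %base, i64 5
//
// so GEPs that differ only in the trailing constant can CSE %base, and the
// constant can fold into the target's addressing mode.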
class SeparateConstOffsetFromGEP : public FunctionPass { public: static char ID; - SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr, - bool LowerGEP = false) - : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) { + SeparateConstOffsetFromGEP(bool LowerGEP = false) + : FunctionPass(ID), LowerGEP(LowerGEP) { initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry()); } @@ -450,7 +448,6 @@ private: const DataLayout *DL = nullptr; DominatorTree *DT = nullptr; ScalarEvolution *SE; - const TargetMachine *TM; LoopInfo *LI; TargetLibraryInfo *TLI; @@ -480,10 +477,8 @@ INITIALIZE_PASS_END( "Split GEPs to a variadic base and a constant offset for better CSE", false, false) -FunctionPass * -llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM, - bool LowerGEP) { - return new SeparateConstOffsetFromGEP(TM, LowerGEP); +FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) { + return new SeparateConstOffsetFromGEP(LowerGEP); } bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, @@ -502,6 +497,8 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1); // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS). + // FIXME: this does not appear to be covered by any tests + // (with x86/aarch64 backends at least) if (BO->getOpcode() == Instruction::Or && !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT)) return false; @@ -590,6 +587,10 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, // Trace into subexpressions for more hoisting opportunities. if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended); + } else if (isa<TruncInst>(V)) { + ConstantOffset = + find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative) + .trunc(BitWidth); } else if (isa<SExtInst>(V)) { ConstantOffset = find(U->getOperand(0), /* SignExtended */ true, ZeroExtended, NonNegative).sext(BitWidth); @@ -654,8 +655,9 @@ ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) { } if (CastInst *Cast = dyn_cast<CastInst>(U)) { - assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) && - "We only traced into two types of CastInst: sext and zext"); + assert( + (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) && + "Only following instructions can be traced: sext, zext & trunc"); ExtInsts.push_back(Cast); UserChain[ChainIndex] = nullptr; return distributeExtsAndCloneChain(ChainIndex - 1); @@ -706,7 +708,7 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { BinaryOperator::BinaryOps NewOp = BO->getOpcode(); if (BO->getOpcode() == Instruction::Or) { // Rebuild "or" as "add", because "or" may be invalid for the new - // epxression. + // expression. // // For instance, given // a | (b + 5) where a and b + 5 have no common bits, @@ -943,6 +945,10 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { if (!NeedsExtraction) return Changed; + + TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*GEP->getFunction()); + // If LowerGEP is disabled, before really splitting the GEP, check whether the // backend supports the addressing mode we are about to produce. If no, this // splitting probably won't be beneficial. @@ -951,9 +957,6 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // of variable indices. 
Therefore, we don't check for addressing modes in that // case. if (!LowerGEP) { - TargetTransformInfo &TTI = - getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *GEP->getParent()->getParent()); unsigned AddrSpace = GEP->getPointerAddressSpace(); if (!TTI.isLegalAddressingMode(GEP->getResultElementType(), /*BaseGV=*/nullptr, AccumulativeByteOffset, @@ -1016,7 +1019,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { if (LowerGEP) { // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to // arithmetic operations if the target uses alias analysis in codegen. - if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA()) + if (TTI.useAA()) lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset); else lowerToArithmetics(GEP, AccumulativeByteOffset); @@ -1065,7 +1068,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { DL->getTypeAllocSize(GEP->getResultElementType())); Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) { - // Very likely. As long as %gep is natually aligned, the byte offset we + // Very likely. As long as %gep is naturally aligned, the byte offset we // extracted should be a multiple of sizeof(*%gep). int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP; NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, @@ -1295,7 +1298,7 @@ void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First, // We changed p+o+c to p+c+o, p+c may not be inbound anymore. const DataLayout &DAL = First->getModule()->getDataLayout(); - APInt Offset(DAL.getPointerSizeInBits( + APInt Offset(DAL.getIndexSizeInBits( cast<PointerType>(First->getType())->getAddressSpace()), 0); Value *NewBase = diff --git a/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index aba732bc413f..34510cb40732 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -1,4 +1,4 @@ -//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===// +///===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===// // // The LLVM Compiler Infrastructure // @@ -17,10 +17,14 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -66,180 +70,65 @@ static cl::opt<int> UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden, cl::desc("The cost threshold for unswitching a loop.")); -static void replaceLoopUsesWithConstant(Loop &L, Value &LIC, - Constant &Replacement) { - assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?"); - - // Replace uses of LIC in the loop with the given constant. - for (auto UI = LIC.use_begin(), UE = LIC.use_end(); UI != UE;) { - // Grab the use and walk past it so we can clobber it in the use list. - Use *U = &*UI++; - Instruction *UserI = dyn_cast<Instruction>(U->getUser()); - if (!UserI || !L.contains(UserI)) - continue; - - // Replace this use within the loop body. 
- *U = &Replacement; - } -} - -/// Update the IDom for a basic block whose predecessor set has changed. -/// -/// This routine is designed to work when the domtree update is relatively -/// localized by leveraging a known common dominator, often a loop header. +/// Collect all of the loop invariant input values transitively used by the +/// homogeneous instruction graph from a given root. /// -/// FIXME: Should consider hand-rolling a slightly more efficient non-DFS -/// approach here as we can do that easily by persisting the candidate IDom's -/// dominating set between each predecessor. -/// -/// FIXME: Longer term, many uses of this can be replaced by an incremental -/// domtree update strategy that starts from a known dominating block and -/// rebuilds that subtree. -static bool updateIDomWithKnownCommonDominator(BasicBlock *BB, - BasicBlock *KnownDominatingBB, - DominatorTree &DT) { - assert(pred_begin(BB) != pred_end(BB) && - "This routine does not handle unreachable blocks!"); - - BasicBlock *OrigIDom = DT[BB]->getIDom()->getBlock(); - - BasicBlock *IDom = *pred_begin(BB); - assert(DT.dominates(KnownDominatingBB, IDom) && - "Bad known dominating block!"); - - // Walk all of the other predecessors finding the nearest common dominator - // until all predecessors are covered or we reach the loop header. The loop - // header necessarily dominates all loop exit blocks in loop simplified form - // so we can early-exit the moment we hit that block. - for (auto PI = std::next(pred_begin(BB)), PE = pred_end(BB); - PI != PE && IDom != KnownDominatingBB; ++PI) { - assert(DT.dominates(KnownDominatingBB, *PI) && - "Bad known dominating block!"); - IDom = DT.findNearestCommonDominator(IDom, *PI); - } +/// This essentially walks from a root recursively through loop variant operands +/// which have the exact same opcode and finds all inputs which are loop +/// invariant. For some operations these can be re-associated and unswitched out +/// of the loop entirely. +static TinyPtrVector<Value *> +collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root, + LoopInfo &LI) { + assert(!L.isLoopInvariant(&Root) && + "Only need to walk the graph if root itself is not invariant."); + TinyPtrVector<Value *> Invariants; + + // Build a worklist and recurse through operators collecting invariants. + SmallVector<Instruction *, 4> Worklist; + SmallPtrSet<Instruction *, 8> Visited; + Worklist.push_back(&Root); + Visited.insert(&Root); + do { + Instruction &I = *Worklist.pop_back_val(); + for (Value *OpV : I.operand_values()) { + // Skip constants as unswitching isn't interesting for them. + if (isa<Constant>(OpV)) + continue; - if (IDom == OrigIDom) - return false; + // Add it to our result if loop invariant. + if (L.isLoopInvariant(OpV)) { + Invariants.push_back(OpV); + continue; + } - DT.changeImmediateDominator(BB, IDom); - return true; -} + // If not an instruction with the same opcode, nothing we can do. + Instruction *OpI = dyn_cast<Instruction>(OpV); + if (!OpI || OpI->getOpcode() != Root.getOpcode()) + continue; -// Note that we don't currently use the IDFCalculator here for two reasons: -// 1) It computes dominator tree levels for the entire function on each run -// of 'compute'. While this isn't terrible, given that we expect to update -// relatively small subtrees of the domtree, it isn't necessarily the right -// tradeoff. -// 2) The interface doesn't fit this usage well. It doesn't operate in -// append-only, and builds several sets that we don't need. 
-// -// FIXME: Neither of these issues are a big deal and could be addressed with -// some amount of refactoring of IDFCalculator. That would allow us to share -// the core logic here (which is solving the same core problem). -static void appendDomFrontier(DomTreeNode *Node, - SmallSetVector<BasicBlock *, 4> &Worklist, - SmallVectorImpl<DomTreeNode *> &DomNodes, - SmallPtrSetImpl<BasicBlock *> &DomSet) { - assert(DomNodes.empty() && "Must start with no dominator nodes."); - assert(DomSet.empty() && "Must start with an empty dominator set."); - - // First flatten this subtree into sequence of nodes by doing a pre-order - // walk. - DomNodes.push_back(Node); - // We intentionally re-evaluate the size as each node can add new children. - // Because this is a tree walk, this cannot add any duplicates. - for (int i = 0; i < (int)DomNodes.size(); ++i) - DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end()); - - // Now create a set of the basic blocks so we can quickly test for - // dominated successors. We could in theory use the DFS numbers of the - // dominator tree for this, but we want this to remain predictably fast - // even while we mutate the dominator tree in ways that would invalidate - // the DFS numbering. - for (DomTreeNode *InnerN : DomNodes) - DomSet.insert(InnerN->getBlock()); - - // Now re-walk the nodes, appending every successor of every node that isn't - // in the set. Note that we don't append the node itself, even though if it - // is a successor it does not strictly dominate itself and thus it would be - // part of the dominance frontier. The reason we don't append it is that - // the node passed in came *from* the worklist and so it has already been - // processed. - for (DomTreeNode *InnerN : DomNodes) - for (BasicBlock *SuccBB : successors(InnerN->getBlock())) - if (!DomSet.count(SuccBB)) - Worklist.insert(SuccBB); - - DomNodes.clear(); - DomSet.clear(); -} + // Visit this operand. + if (Visited.insert(OpI).second) + Worklist.push_back(OpI); + } + } while (!Worklist.empty()); -/// Update the dominator tree after unswitching a particular former exit block. -/// -/// This handles the full update of the dominator tree after hoisting a block -/// that previously was an exit block (or split off of an exit block) up to be -/// reached from the new immediate dominator of the preheader. -/// -/// The common case is simple -- we just move the unswitched block to have an -/// immediate dominator of the old preheader. But in complex cases, there may -/// be other blocks reachable from the unswitched block that are immediately -/// dominated by some node between the unswitched one and the old preheader. -/// All of these also need to be hoisted in the dominator tree. We also want to -/// minimize queries to the dominator tree because each step of this -/// invalidates any DFS numbers that would make queries fast. -static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, - DominatorTree &DT) { - DomTreeNode *OldPHNode = DT[OldPH]; - DomTreeNode *UnswitchedNode = DT[UnswitchedBB]; - // If the dominator tree has already been updated for this unswitched node, - // we're done. This makes it easier to use this routine if there are multiple - // paths to the same unswitched destination. - if (UnswitchedNode->getIDom() == OldPHNode) - return; + return Invariants; +} - // First collect the domtree nodes that we are hoisting over. These are the - // set of nodes which may have children that need to be hoisted as well. 
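// [Editor's worked example -- not part of this patch.] For a branch condition
// built from a homogeneous `or` graph,
//
//   %t = or i1 %var, %inv2
//   %c = or i1 %t, %inv1        ; the branch condition rooted at %c
//
// where %inv1 and %inv2 are loop-invariant but %var is not, the walk above
// returns {%inv1, %inv2}: it records invariant leaves and recurses only
// through operands whose opcode matches the root's, so it never looks past
// an opcode change in a mixed and/or graph.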
- SmallPtrSet<DomTreeNode *, 4> DomChain; - for (auto *IDom = UnswitchedNode->getIDom(); IDom != OldPHNode; - IDom = IDom->getIDom()) - DomChain.insert(IDom); - - // The unswitched block ends up immediately dominated by the old preheader -- - // regardless of whether it is the loop exit block or split off of the loop - // exit block. - DT.changeImmediateDominator(UnswitchedNode, OldPHNode); - - // For everything that moves up the dominator tree, we need to examine the - // dominator frontier to see if it additionally should move up the dominator - // tree. This lambda appends the dominator frontier for a node on the - // worklist. - SmallSetVector<BasicBlock *, 4> Worklist; - - // Scratch data structures reused by domfrontier finding. - SmallVector<DomTreeNode *, 4> DomNodes; - SmallPtrSet<BasicBlock *, 4> DomSet; - - // Append the initial dom frontier nodes. - appendDomFrontier(UnswitchedNode, Worklist, DomNodes, DomSet); - - // Walk the worklist. We grow the list in the loop and so must recompute size. - for (int i = 0; i < (int)Worklist.size(); ++i) { - auto *BB = Worklist[i]; - - DomTreeNode *Node = DT[BB]; - assert(!DomChain.count(Node) && - "Cannot be dominated by a block you can reach!"); - - // If this block had an immediate dominator somewhere in the chain - // we hoisted over, then its position in the domtree needs to move as it is - // reachable from a node hoisted over this chain. - if (!DomChain.count(Node->getIDom())) - continue; +static void replaceLoopInvariantUses(Loop &L, Value *Invariant, + Constant &Replacement) { + assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?"); - DT.changeImmediateDominator(Node, OldPHNode); + // Replace uses of LIC in the loop with the given constant. + for (auto UI = Invariant->use_begin(), UE = Invariant->use_end(); UI != UE;) { + // Grab the use and walk past it so we can clobber it in the use list. + Use *U = &*UI++; + Instruction *UserI = dyn_cast<Instruction>(U->getUser()); - // Now add this node's dominator frontier to the worklist as well. - appendDomFrontier(Node, Worklist, DomNodes, DomSet); + // Replace this use within the loop body. + if (UserI && L.contains(UserI)) + U->set(&Replacement); } } @@ -261,6 +150,26 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, llvm_unreachable("Basic blocks should never be empty!"); } +/// Insert code to test a set of loop invariant values, and conditionally branch +/// on them. +static void buildPartialUnswitchConditionalBranch(BasicBlock &BB, + ArrayRef<Value *> Invariants, + bool Direction, + BasicBlock &UnswitchedSucc, + BasicBlock &NormalSucc) { + IRBuilder<> IRB(&BB); + Value *Cond = Invariants.front(); + for (Value *Invariant : + make_range(std::next(Invariants.begin()), Invariants.end())) + if (Direction) + Cond = IRB.CreateOr(Cond, Invariant); + else + Cond = IRB.CreateAnd(Cond, Invariant); + + IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, + Direction ? &NormalSucc : &UnswitchedSucc); +} + /// Rewrite the PHI nodes in an unswitched loop exit basic block. 
/// /// Requires that the loop exit and unswitched basic block are the same, and @@ -293,7 +202,8 @@ static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, BasicBlock &UnswitchedBB, BasicBlock &OldExitingBB, - BasicBlock &OldPH) { + BasicBlock &OldPH, + bool FullUnswitch) { assert(&ExitBB != &UnswitchedBB && "Must have different loop exit and unswitched blocks!"); Instruction *InsertPt = &*UnswitchedBB.begin(); @@ -314,7 +224,11 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, if (PN.getIncomingBlock(i) != &OldExitingBB) continue; - Value *Incoming = PN.removeIncomingValue(i); + Value *Incoming = PN.getIncomingValue(i); + if (FullUnswitch) + // No more edge from the old exiting block to the exit block. + PN.removeIncomingValue(i); + NewPN->addIncoming(Incoming, &OldPH); } @@ -325,6 +239,76 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, } } +/// Hoist the current loop up to the innermost loop containing a remaining exit. +/// +/// Because we've removed an exit from the loop, we may have changed the set of +/// loops reachable and need to move the current loop up the loop nest or even +/// to an entirely separate nest. +static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, + DominatorTree &DT, LoopInfo &LI) { + // If the loop is already at the top level, we can't hoist it anywhere. + Loop *OldParentL = L.getParentLoop(); + if (!OldParentL) + return; + + SmallVector<BasicBlock *, 4> Exits; + L.getExitBlocks(Exits); + Loop *NewParentL = nullptr; + for (auto *ExitBB : Exits) + if (Loop *ExitL = LI.getLoopFor(ExitBB)) + if (!NewParentL || NewParentL->contains(ExitL)) + NewParentL = ExitL; + + if (NewParentL == OldParentL) + return; + + // The new parent loop (if different) should always contain the old one. + if (NewParentL) + assert(NewParentL->contains(OldParentL) && + "Can only hoist this loop up the nest!"); + + // The preheader will need to move with the body of this loop. However, + // because it isn't in this loop we also need to update the primary loop map. + assert(OldParentL == LI.getLoopFor(&Preheader) && + "Parent loop of this loop should contain this loop's preheader!"); + LI.changeLoopFor(&Preheader, NewParentL); + + // Remove this loop from its old parent. + OldParentL->removeChildLoop(&L); + + // Add the loop either to the new parent or as a top-level loop. + if (NewParentL) + NewParentL->addChildLoop(&L); + else + LI.addTopLevelLoop(&L); + + // Remove this loop's blocks from the old parent and every other loop up the + // nest until reaching the new parent. Also update all of these + // no-longer-containing loops to reflect the nesting change. + for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL; + OldContainingL = OldContainingL->getParentLoop()) { + llvm::erase_if(OldContainingL->getBlocksVector(), + [&](const BasicBlock *BB) { + return BB == &Preheader || L.contains(BB); + }); + + OldContainingL->getBlocksSet().erase(&Preheader); + for (BasicBlock *BB : L.blocks()) + OldContainingL->getBlocksSet().erase(BB); + + // Because we just hoisted a loop out of this one, we have essentially + // created new exit paths from it. That means we need to form LCSSA PHI + // nodes for values used in the no-longer-nested loop. + formLCSSA(*OldContainingL, DT, &LI, nullptr); + + // We shouldn't need to form dedicated exits because the exit introduced + // here is the (just split by unswitching) preheader.
As such, it is + // necessarily dedicated. + assert(OldContainingL->hasDedicatedExits() && + "Unexpected predecessor of hoisted loop preheader!"); + } +} + /// Unswitch a trivial branch if the condition is loop invariant. /// /// This routine should only be called when loop code leading to the branch has @@ -339,48 +323,83 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, /// (splitting the exit block as necessary). It simplifies the branch within /// the loop to an unconditional branch but doesn't remove it entirely. Further /// cleanup can be done with some simplify-cfg like pass. +/// +/// If `SE` is not null, it will be updated based on the potential loop SCEVs +/// invalidated by this. static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, - LoopInfo &LI) { + LoopInfo &LI, ScalarEvolution *SE) { assert(BI.isConditional() && "Can only unswitch a conditional branch!"); - DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n"); + LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n"); - Value *LoopCond = BI.getCondition(); + // The loop invariant values that we want to unswitch. + TinyPtrVector<Value *> Invariants; - // Need a trivial loop condition to unswitch. - if (!L.isLoopInvariant(LoopCond)) - return false; + // When true, we're fully unswitching the branch rather than just unswitching + // some input conditions to the branch. + bool FullUnswitch = false; + + if (L.isLoopInvariant(BI.getCondition())) { + Invariants.push_back(BI.getCondition()); + FullUnswitch = true; + } else { + if (auto *CondInst = dyn_cast<Instruction>(BI.getCondition())) + Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI); + if (Invariants.empty()) + // Couldn't find invariant inputs! + return false; + } - // FIXME: We should compute this once at the start and update it! - SmallVector<BasicBlock *, 16> ExitBlocks; - L.getExitBlocks(ExitBlocks); - SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(), - ExitBlocks.end()); - - // Check to see if a successor of the branch is guaranteed to - // exit through a unique exit block without having any - // side-effects. If so, determine the value of Cond that causes - // it to do this. - ConstantInt *CondVal = ConstantInt::getTrue(BI.getContext()); - ConstantInt *Replacement = ConstantInt::getFalse(BI.getContext()); + // Check that one of the branch's successors exits, and which one. + bool ExitDirection = true; int LoopExitSuccIdx = 0; auto *LoopExitBB = BI.getSuccessor(0); - if (!ExitBlockSet.count(LoopExitBB)) { - std::swap(CondVal, Replacement); + if (L.contains(LoopExitBB)) { + ExitDirection = false; LoopExitSuccIdx = 1; LoopExitBB = BI.getSuccessor(1); - if (!ExitBlockSet.count(LoopExitBB)) + if (L.contains(LoopExitBB)) return false; } auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx); - assert(L.contains(ContinueBB) && - "Cannot have both successors exit and still be in the loop!"); - auto *ParentBB = BI.getParent(); if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB)) return false; - DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal - << " == " << LoopCond << "\n"); + // When unswitching only part of the branch's condition, we need the exit + // block to be reached directly from the partially unswitched input. This can + // be done when the exit block is along the true edge and the branch condition + // is a graph of `or` operations, or the exit block is along the false edge + // and the condition is a graph of `and` operations. 
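// [Editor's worked example -- not part of this patch.] With
//
//   br i1 (or i1 %inv, %var), label %exit, label %body
//
// the exit sits on the true edge and the condition is an `or` graph, so %inv
// qualifies: the new preheader branches straight to the exit when %inv is
// true, and inside the loop %inv is replaced by `false`, leaving the branch
// to depend only on %var. Dually, an `and` graph exiting on the false edge
// unswitches with `true` as the in-loop replacement.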
+ if (!FullUnswitch) { + if (ExitDirection) { + if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::Or) + return false; + } else { + if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::And) + return false; + } + } + + LLVM_DEBUG({ + dbgs() << " unswitching trivial invariant conditions for: " << BI + << "\n"; + for (Value *Invariant : Invariants) { + dbgs() << " " << *Invariant << " == true"; + if (Invariant != Invariants.back()) + dbgs() << " ||"; + dbgs() << "\n"; + } + }); + + // If we have scalar evolutions, we need to invalidate them including this + // loop and the loop containing the exit block. + if (SE) { + if (Loop *ExitL = LI.getLoopFor(LoopExitBB)) + SE->forgetLoop(ExitL); + else + // Forget the entire nest as this exits the entire nest. + SE->forgetTopmostLoop(&L); + } // Split the preheader, so that we know that there is a safe place to insert // the conditional branch. We will change the preheader to have a conditional @@ -393,45 +412,73 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // unswitching. We need to split this if there are other loop predecessors. // Because the loop is in simplified form, *any* other predecessor is enough. BasicBlock *UnswitchedBB; - if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) { - (void)PredBB; - assert(PredBB == BI.getParent() && + if (FullUnswitch && LoopExitBB->getUniquePredecessor()) { + assert(LoopExitBB->getUniquePredecessor() == BI.getParent() && "A branch's parent isn't a predecessor!"); UnswitchedBB = LoopExitBB; } else { UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI); } - // Now splice the branch to gate reaching the new preheader and re-point its - // successors. - OldPH->getInstList().splice(std::prev(OldPH->end()), - BI.getParent()->getInstList(), BI); + // Actually move the invariant uses into the unswitched position. If possible, + // we do this by moving the instructions, but when doing partial unswitching + // we do it by building a new merge of the values in the unswitched position. OldPH->getTerminator()->eraseFromParent(); - BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB); - BI.setSuccessor(1 - LoopExitSuccIdx, NewPH); - - // Create a new unconditional branch that will continue the loop as a new - // terminator. - BranchInst::Create(ContinueBB, ParentBB); + if (FullUnswitch) { + // If fully unswitching, we can use the existing branch instruction. + // Splice it into the old PH to gate reaching the new preheader and re-point + // its successors. + OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(), + BI); + BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB); + BI.setSuccessor(1 - LoopExitSuccIdx, NewPH); + + // Create a new unconditional branch that will continue the loop as a new + // terminator. + BranchInst::Create(ContinueBB, ParentBB); + } else { + // Only unswitching a subset of inputs to the condition, so we will need to + // build a new branch that merges the invariant inputs. + if (ExitDirection) + assert(cast<Instruction>(BI.getCondition())->getOpcode() == + Instruction::Or && + "Must have an `or` of `i1`s for the condition!"); + else + assert(cast<Instruction>(BI.getCondition())->getOpcode() == + Instruction::And && + "Must have an `and` of `i1`s for the condition!"); + buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection, + *UnswitchedBB, *NewPH); + } // Rewrite the relevant PHI nodes. 
if (UnswitchedBB == LoopExitBB) rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH); else rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB, - *ParentBB, *OldPH); + *ParentBB, *OldPH, FullUnswitch); // Now we need to update the dominator tree. - updateDTAfterUnswitch(UnswitchedBB, OldPH, DT); - // But if we split something off of the loop exit block then we also removed - // one of the predecessors for the loop exit block and may need to update its - // idom. - if (UnswitchedBB != LoopExitBB) - updateIDomWithKnownCommonDominator(LoopExitBB, L.getHeader(), DT); + DT.insertEdge(OldPH, UnswitchedBB); + if (FullUnswitch) + DT.deleteEdge(ParentBB, UnswitchedBB); + + // The constant we can replace all of our invariants with inside the loop + // body. If any of the invariants have a value other than this the loop won't + // be entered. + ConstantInt *Replacement = ExitDirection + ? ConstantInt::getFalse(BI.getContext()) + : ConstantInt::getTrue(BI.getContext()); // Since this is an i1 condition we can also trivially replace uses of it // within the loop with a constant. - replaceLoopUsesWithConstant(L, *LoopCond, *Replacement); + for (Value *Invariant : Invariants) + replaceLoopInvariantUses(L, Invariant, *Replacement); + + // If this was full unswitching, we may have changed the nesting relationship + // for this loop so hoist it to its correct parent if needed. + if (FullUnswitch) + hoistLoopToNewParent(L, *NewPH, DT, LI); ++NumTrivial; ++NumBranches; @@ -461,9 +508,12 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, /// switch will not be revisited. If after unswitching there is only a single /// in-loop successor, the switch is further simplified to an unconditional /// branch. Still more cleanup can be done with some simplify-cfg like pass. +/// +/// If `SE` is not null, it will be updated based on the potential loop SCEVs +/// invalidated by this. static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, - LoopInfo &LI) { - DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n"); + LoopInfo &LI, ScalarEvolution *SE) { + LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n"); Value *LoopCond = SI.getCondition(); // If this isn't switching on an invariant condition, we can't unswitch it. @@ -472,41 +522,62 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, auto *ParentBB = SI.getParent(); - // FIXME: We should compute this once at the start and update it! - SmallVector<BasicBlock *, 16> ExitBlocks; - L.getExitBlocks(ExitBlocks); - SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(), - ExitBlocks.end()); - SmallVector<int, 4> ExitCaseIndices; for (auto Case : SI.cases()) { auto *SuccBB = Case.getCaseSuccessor(); - if (ExitBlockSet.count(SuccBB) && + if (!L.contains(SuccBB) && areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB)) ExitCaseIndices.push_back(Case.getCaseIndex()); } BasicBlock *DefaultExitBB = nullptr; - if (ExitBlockSet.count(SI.getDefaultDest()) && + if (!L.contains(SI.getDefaultDest()) && areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) && !isa<UnreachableInst>(SI.getDefaultDest()->getTerminator())) DefaultExitBB = SI.getDefaultDest(); else if (ExitCaseIndices.empty()) return false; - DEBUG(dbgs() << " unswitching trivial cases...\n"); + LLVM_DEBUG(dbgs() << " unswitching trivial cases...\n"); + + // We may need to invalidate SCEVs for the outermost loop reached by any of + // the exits. 
+ Loop *OuterL = &L; + if (DefaultExitBB) { + // Clear out the default destination temporarily to allow accurate + // predecessor lists to be examined below. + SI.setDefaultDest(nullptr); + // Check the loop containing this exit. + Loop *ExitL = LI.getLoopFor(DefaultExitBB); + if (!ExitL || ExitL->contains(OuterL)) + OuterL = ExitL; + } + + // Store the exit cases into a separate data structure and remove them from + // the switch. SmallVector<std::pair<ConstantInt *, BasicBlock *>, 4> ExitCases; ExitCases.reserve(ExitCaseIndices.size()); // We walk the case indices backwards so that we remove the last case first // and don't disrupt the earlier indices. for (unsigned Index : reverse(ExitCaseIndices)) { auto CaseI = SI.case_begin() + Index; + // Compute the outer loop from this exit. + Loop *ExitL = LI.getLoopFor(CaseI->getCaseSuccessor()); + if (!ExitL || ExitL->contains(OuterL)) + OuterL = ExitL; // Save the value of this case. ExitCases.push_back({CaseI->getCaseValue(), CaseI->getCaseSuccessor()}); // Delete the unswitched cases. SI.removeCase(CaseI); } + if (SE) { + if (OuterL) + SE->forgetLoop(OuterL); + else + SE->forgetTopmostLoop(&L); + } + // Check if after this all of the remaining cases point at the same // successor. BasicBlock *CommonSuccBB = nullptr; @@ -517,23 +588,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, SI.case_begin()->getCaseSuccessor(); })) CommonSuccBB = SI.case_begin()->getCaseSuccessor(); - - if (DefaultExitBB) { - // We can't remove the default edge so replace it with an edge to either - // the single common remaining successor (if we have one) or an unreachable - // block. - if (CommonSuccBB) { - SI.setDefaultDest(CommonSuccBB); - } else { - BasicBlock *UnreachableBB = BasicBlock::Create( - ParentBB->getContext(), - Twine(ParentBB->getName()) + ".unreachable_default", - ParentBB->getParent()); - new UnreachableInst(ParentBB->getContext(), UnreachableBB); - SI.setDefaultDest(UnreachableBB); - DT.addNewBlock(UnreachableBB, ParentBB); - } - } else { + if (!DefaultExitBB) { // If we're not unswitching the default, we need it to match any cases to // have a common successor or if we have no cases it is the common // successor. @@ -570,9 +625,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, } else { auto *SplitBB = SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI); - rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB, - *ParentBB, *OldPH); - updateIDomWithKnownCommonDominator(DefaultExitBB, L.getHeader(), DT); + rewritePHINodesForExitAndUnswitchedBlocks( + *DefaultExitBB, *SplitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true); DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; } } @@ -597,9 +651,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, if (!SplitExitBB) { // If this is the first time we see this, do the split and remember it. SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI); - rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB, - *ParentBB, *OldPH); - updateIDomWithKnownCommonDominator(ExitBB, L.getHeader(), DT); + rewritePHINodesForExitAndUnswitchedBlocks( + *ExitBB, *SplitExitBB, *ParentBB, *OldPH, /*FullUnswitch*/ true); } // Update the case pair to point to the split block. 
CasePair.second = SplitExitBB; @@ -612,14 +665,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, BasicBlock *UnswitchedBB = CasePair.second; NewSI->addCase(CaseVal, UnswitchedBB); - updateDTAfterUnswitch(UnswitchedBB, OldPH, DT); } // If the default was unswitched, re-point it and add explicit cases for // entering the loop. if (DefaultExitBB) { NewSI->setDefaultDest(DefaultExitBB); - updateDTAfterUnswitch(DefaultExitBB, OldPH, DT); // We removed all the exit cases, so we just copy the cases to the // unswitched switch. @@ -633,11 +684,57 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, // pointing at unreachable and other complexity. if (CommonSuccBB) { BasicBlock *BB = SI.getParent(); + // We may have had multiple edges to this common successor block, so remove + // them as predecessors. We skip the first one, either the default or the + // actual first case. + bool SkippedFirst = DefaultExitBB == nullptr; + for (auto Case : SI.cases()) { + assert(Case.getCaseSuccessor() == CommonSuccBB && + "Non-common successor!"); + (void)Case; + if (!SkippedFirst) { + SkippedFirst = true; + continue; + } + CommonSuccBB->removePredecessor(BB, + /*DontDeleteUselessPHIs*/ true); + } + // Now nuke the switch and replace it with a direct branch. SI.eraseFromParent(); BranchInst::Create(CommonSuccBB, BB); + } else if (DefaultExitBB) { + assert(SI.getNumCases() > 0 && + "If we had no cases we'd have a common successor!"); + // Move the last case to the default successor. This is valid as if the + // default got unswitched it cannot be reached. This has the advantage of + // being simple and keeping the number of edges from this switch to + // successors the same, and avoiding any PHI update complexity. + auto LastCaseI = std::prev(SI.case_end()); + SI.setDefaultDest(LastCaseI->getCaseSuccessor()); + SI.removeCase(LastCaseI); } - DT.verifyDomTree(); + // Walk the unswitched exit blocks and the unswitched split blocks and update + // the dominator tree based on the CFG edits. While we are walking unordered + // containers here, the API for applyUpdates takes an unordered list of + // updates and requires them to not contain duplicates. + SmallVector<DominatorTree::UpdateType, 4> DTUpdates; + for (auto *UnswitchedExitBB : UnswitchedExitBBs) { + DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB}); + DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB}); + } + for (auto SplitUnswitchedPair : SplitExitBBMap) { + auto *UnswitchedBB = SplitUnswitchedPair.second; + DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedBB}); + DTUpdates.push_back({DT.Insert, OldPH, UnswitchedBB}); + } + DT.applyUpdates(DTUpdates); + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + + // We may have changed the nesting relationship for this loop so hoist it to + // its correct parent if needed. + hoistLoopToNewParent(L, *NewPH, DT, LI); + ++NumTrivial; ++NumSwitches; return true; @@ -652,8 +749,11 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, /// /// The return value indicates whether anything was unswitched (and therefore /// changed). +/// +/// If `SE` is not null, it will be updated based on the potential loop SCEVs +/// invalidated by this. 
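As the comment above notes, `applyUpdates` accepts an unordered list of updates but rejects duplicates. A sketch of the batching idiom, assuming the caller supplies each unswitched block exactly once (for example from a `SetVector`); the function name is illustrative:

    #include <cassert>
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Dominators.h"
    using namespace llvm;

    // Collect the CFG edits as (kind, from, to) triples, then apply them in
    // one batch. Duplicate triples are not allowed, so the inputs must
    // already be deduplicated.
    static void retargetEdgesInDomTree(DominatorTree &DT, BasicBlock *OldPH,
                                       BasicBlock *ParentBB,
                                       ArrayRef<BasicBlock *> UniqueTargets) {
      SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
      for (BasicBlock *BB : UniqueTargets) {
        DTUpdates.push_back({DominatorTree::Delete, ParentBB, BB});
        DTUpdates.push_back({DominatorTree::Insert, OldPH, BB});
      }
      DT.applyUpdates(DTUpdates);
      assert(DT.verify(DominatorTree::VerificationLevel::Fast));
    }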
static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, - LoopInfo &LI) { + LoopInfo &LI, ScalarEvolution *SE) { bool Changed = false; // If loop header has only one reachable successor we should keep looking for @@ -687,8 +787,8 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, if (isa<Constant>(SI->getCondition())) return Changed; - if (!unswitchTrivialSwitch(L, *SI, DT, LI)) - // Coludn't unswitch this one so we're done. + if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE)) + // Couldn't unswitch this one so we're done. return Changed; // Mark that we managed to unswitch something. @@ -719,17 +819,19 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, // Found a trivial condition candidate: non-foldable conditional branch. If // we fail to unswitch this, we can't do anything else that is trivial. - if (!unswitchTrivialBranch(L, *BI, DT, LI)) + if (!unswitchTrivialBranch(L, *BI, DT, LI, SE)) return Changed; // Mark that we managed to unswitch something. Changed = true; - // We unswitched the branch. This should always leave us with an - // unconditional branch that we can follow now. + // If we only unswitched some of the conditions feeding the branch, we won't + // have collapsed it to a single successor. BI = cast<BranchInst>(CurrentBB->getTerminator()); - assert(!BI->isConditional() && - "Cannot form a conditional branch by unswitching1"); + if (BI->isConditional()) + return Changed; + + // Follow the newly unconditional branch into its successor. CurrentBB = BI->getSuccessor(0); // When continuing, if we exit the loop or reach a previous visited block, @@ -748,8 +850,12 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, /// /// This routine handles cloning all of the necessary loop blocks and exit /// blocks including rewriting their instructions and the relevant PHI nodes. -/// It skips loop and exit blocks that are not necessary based on the provided -/// set. It also correctly creates the unconditional branch in the cloned +/// Any loop blocks or exit blocks which are dominated by a different successor +/// than the one for this clone of the loop blocks can be trivially skipped. We +/// use the `DominatingSucc` map to determine whether a block satisfies that +/// property with a simple map lookup. +/// +/// It also correctly creates the unconditional branch in the cloned /// unswitched parent block to only point at the unswitched successor. /// /// This does not handle most of the necessary updates to `LoopInfo`. Only exit @@ -763,9 +869,10 @@ static BasicBlock *buildClonedLoopBlocks( Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB, ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB, BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB, - const SmallPtrSetImpl<BasicBlock *> &SkippedLoopAndExitBlocks, - ValueToValueMapTy &VMap, AssumptionCache &AC, DominatorTree &DT, - LoopInfo &LI) { + const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc, + ValueToValueMapTy &VMap, + SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC, + DominatorTree &DT, LoopInfo &LI) { SmallVector<BasicBlock *, 4> NewBlocks; NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size()); @@ -780,26 +887,29 @@ static BasicBlock *buildClonedLoopBlocks( NewBlocks.push_back(NewBB); VMap[OldBB] = NewBB; - // Add the block to the domtree. We'll move it to the correct position - // below. 
- DT.addNewBlock(NewBB, SplitBB); - return NewBB; }; + // We skip cloning blocks when they have a dominating succ that is not the + // succ we are cloning for. + auto SkipBlock = [&](BasicBlock *BB) { + auto It = DominatingSucc.find(BB); + return It != DominatingSucc.end() && It->second != UnswitchedSuccBB; + }; + // First, clone the preheader. auto *ClonedPH = CloneBlock(LoopPH); // Then clone all the loop blocks, skipping the ones that aren't necessary. for (auto *LoopBB : L.blocks()) - if (!SkippedLoopAndExitBlocks.count(LoopBB)) + if (!SkipBlock(LoopBB)) CloneBlock(LoopBB); // Split all the loop exit edges so that when we clone the exit blocks, if // any of the exit blocks are *also* a preheader for some other loop, we // don't create multiple predecessors entering the loop header. for (auto *ExitBB : ExitBlocks) { - if (SkippedLoopAndExitBlocks.count(ExitBB)) + if (SkipBlock(ExitBB)) continue; // When we are going to clone an exit, we don't need to clone all the @@ -822,17 +932,6 @@ static BasicBlock *buildClonedLoopBlocks( assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB && "Cloned exit block has the wrong successor!"); - // Move the merge block's idom to be the split point as one exit is - // dominated by one header, and the other by another, so we know the split - // point dominates both. While the dominator tree isn't fully accurate, we - // want sub-trees within the original loop to be correctly reflect - // dominance within that original loop (at least) and that requires moving - // the merge block out of that subtree. - // FIXME: This is very brittle as we essentially have a partial contract on - // the dominator tree. We really need to instead update it and keep it - // valid or stop relying on it. - DT.changeImmediateDominator(MergeBB, SplitBB); - // Remap any cloned instructions and create a merge phi node for them. for (auto ZippedInsts : llvm::zip_first( llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())), @@ -872,28 +971,63 @@ static BasicBlock *buildClonedLoopBlocks( AC.registerAssumption(II); } - // Remove the cloned parent as a predecessor of the cloned continue successor - // if we did in fact clone it. - auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB)); - if (auto *ClonedContinueSuccBB = - cast_or_null<BasicBlock>(VMap.lookup(ContinueSuccBB))) - ClonedContinueSuccBB->removePredecessor(ClonedParentBB, - /*DontDeleteUselessPHIs*/ true); - // Replace the cloned branch with an unconditional branch to the cloneed - // unswitched successor. - auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB)); - ClonedParentBB->getTerminator()->eraseFromParent(); - BranchInst::Create(ClonedSuccBB, ClonedParentBB); - // Update any PHI nodes in the cloned successors of the skipped blocks to not // have spurious incoming values. for (auto *LoopBB : L.blocks()) - if (SkippedLoopAndExitBlocks.count(LoopBB)) + if (SkipBlock(LoopBB)) for (auto *SuccBB : successors(LoopBB)) if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB))) for (PHINode &PN : ClonedSuccBB->phis()) PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false); + // Remove the cloned parent as a predecessor of any successor we ended up + // cloning other than the unswitched one. 
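The comment above about splitting loop exit edges is worth a concrete illustration: if an exit block also serves as the preheader of a sibling loop, cloning it verbatim would give that loop's header a second preheader-like predecessor. A hedged sketch using the standard `SplitBlock` utility; the wrapper is illustrative:

    #include "llvm/Transforms/Utils/BasicBlockUtils.h"
    using namespace llvm;

    // Peel the head off the exit block so the clone merges into a dedicated
    // block rather than into one that may double as another loop's
    // preheader. SplitBlock skips leading PHIs, so they stay in ExitBB.
    static BasicBlock *splitExitForCloning(BasicBlock *ExitBB,
                                           DominatorTree &DT, LoopInfo &LI) {
      return SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
    }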
+ auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB)); + for (auto *SuccBB : successors(ParentBB)) { + if (SuccBB == UnswitchedSuccBB) + continue; + + auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)); + if (!ClonedSuccBB) + continue; + + ClonedSuccBB->removePredecessor(ClonedParentBB, + /*DontDeleteUselessPHIs*/ true); + } + + // Replace the cloned branch with an unconditional branch to the cloned + // unswitched successor. + auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB)); + ClonedParentBB->getTerminator()->eraseFromParent(); + BranchInst::Create(ClonedSuccBB, ClonedParentBB); + + // If there are duplicate entries in the PHI nodes because of multiple edges + // to the unswitched successor, we need to nuke all but one as we replaced it + // with a direct branch. + for (PHINode &PN : ClonedSuccBB->phis()) { + bool Found = false; + // Loop over the incoming operands backwards so we can easily delete as we + // go without invalidating the index. + for (int i = PN.getNumOperands() - 1; i >= 0; --i) { + if (PN.getIncomingBlock(i) != ClonedParentBB) + continue; + if (!Found) { + Found = true; + continue; + } + PN.removeIncomingValue(i, /*DeletePHIIfEmpty*/ false); + } + } + + // Record the domtree updates for the new blocks. + SmallPtrSet<BasicBlock *, 4> SuccSet; + for (auto *ClonedBB : NewBlocks) { + for (auto *SuccBB : successors(ClonedBB)) + if (SuccSet.insert(SuccBB).second) + DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB}); + SuccSet.clear(); + } + return ClonedPH; } @@ -911,11 +1045,8 @@ static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL, for (auto *BB : OrigL.blocks()) { auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB)); ClonedL.addBlockEntry(ClonedBB); - if (LI.getLoopFor(BB) == &OrigL) { - assert(!LI.getLoopFor(ClonedBB) && - "Should not have an existing loop for this block!"); + if (LI.getLoopFor(BB) == &OrigL) LI.changeLoopFor(ClonedBB, &ClonedL); - } } }; @@ -965,9 +1096,9 @@ static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL, /// original loop, multiple cloned sibling loops may be created. All of them /// are returned so that the newly introduced loop nest roots can be /// identified. -static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, - const ValueToValueMapTy &VMap, LoopInfo &LI, - SmallVectorImpl<Loop *> &NonChildClonedLoops) { +static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, + const ValueToValueMapTy &VMap, LoopInfo &LI, + SmallVectorImpl<Loop *> &NonChildClonedLoops) { Loop *ClonedL = nullptr; auto *OrigPH = OrigL.getLoopPreheader(); @@ -1060,6 +1191,7 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, } else { LI.addTopLevelLoop(ClonedL); } + NonChildClonedLoops.push_back(ClonedL); ClonedL->reserveBlocks(BlocksInClonedLoop.size()); // We don't want to just add the cloned loop blocks based on how we @@ -1128,11 +1260,11 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, // matter as we're just trying to build up the map from inside-out; we use // the map in a more stably ordered way below. 
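The duplicate-PHI-entry cleanup in the hunk above generalizes to a small helper. The backwards walk matters because removing an incoming value compacts the operand list; a sketch with an illustrative helper name:

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Keep exactly one incoming entry from `Pred`, deleting the rest.
    // Deleting the entry at index i only shifts entries above i, which the
    // backwards walk has already visited, so i stays valid.
    static void keepOneIncomingFrom(PHINode &PN, BasicBlock *Pred) {
      bool Found = false;
      for (int i = PN.getNumOperands() - 1; i >= 0; --i) {
        if (PN.getIncomingBlock(i) != Pred)
          continue;
        if (!Found) {
          Found = true;
          continue;
        }
        PN.removeIncomingValue(i, /*DeletePHIIfEmpty*/ false);
      }
    }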
auto OrderedClonedExitsInLoops = ClonedExitsInLoops; - std::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(), - [&](BasicBlock *LHS, BasicBlock *RHS) { - return ExitLoopMap.lookup(LHS)->getLoopDepth() < - ExitLoopMap.lookup(RHS)->getLoopDepth(); - }); + llvm::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(), + [&](BasicBlock *LHS, BasicBlock *RHS) { + return ExitLoopMap.lookup(LHS)->getLoopDepth() < + ExitLoopMap.lookup(RHS)->getLoopDepth(); + }); // Populate the existing ExitLoopMap with everything reachable from each // exit, starting from the inner most exit. @@ -1212,60 +1344,69 @@ static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, NonChildClonedLoops.push_back(cloneLoopNest( *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI)); } +} + +static void +deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, + ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps, + DominatorTree &DT) { + // Find all the dead clones, and remove them from their successors. + SmallVector<BasicBlock *, 16> DeadBlocks; + for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks)) + for (auto &VMap : VMaps) + if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(VMap->lookup(BB))) + if (!DT.isReachableFromEntry(ClonedBB)) { + for (BasicBlock *SuccBB : successors(ClonedBB)) + SuccBB->removePredecessor(ClonedBB); + DeadBlocks.push_back(ClonedBB); + } - // Return the main cloned loop if any. - return ClonedL; + // Drop any remaining references to break cycles. + for (BasicBlock *BB : DeadBlocks) + BB->dropAllReferences(); + // Erase them from the IR. + for (BasicBlock *BB : DeadBlocks) + BB->eraseFromParent(); } -static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot, - SmallVectorImpl<BasicBlock *> &ExitBlocks, - DominatorTree &DT, LoopInfo &LI) { - // Walk the dominator tree to build up the set of blocks we will delete here. - // The order is designed to allow us to always delete bottom-up and avoid any - // dangling uses. - SmallSetVector<BasicBlock *, 16> DeadBlocks; - DeadBlocks.insert(DeadSubtreeRoot); - for (int i = 0; i < (int)DeadBlocks.size(); ++i) - for (DomTreeNode *ChildN : *DT[DeadBlocks[i]]) { - // FIXME: This assert should pass and that means we don't change nearly - // as much below! Consider rewriting all of this to avoid deleting - // blocks. They are always cloned before being deleted, and so instead - // could just be moved. - // FIXME: This in turn means that we might actually be more able to - // update the domtree. - assert((L.contains(ChildN->getBlock()) || - llvm::find(ExitBlocks, ChildN->getBlock()) != ExitBlocks.end()) && - "Should never reach beyond the loop and exits when deleting!"); - DeadBlocks.insert(ChildN->getBlock()); +static void +deleteDeadBlocksFromLoop(Loop &L, + SmallVectorImpl<BasicBlock *> &ExitBlocks, + DominatorTree &DT, LoopInfo &LI) { + // Find all the dead blocks, and remove them from their successors. + SmallVector<BasicBlock *, 16> DeadBlocks; + for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks)) + if (!DT.isReachableFromEntry(BB)) { + for (BasicBlock *SuccBB : successors(BB)) + SuccBB->removePredecessor(BB); + DeadBlocks.push_back(BB); } + SmallPtrSet<BasicBlock *, 16> DeadBlockSet(DeadBlocks.begin(), + DeadBlocks.end()); + // Filter out the dead blocks from the exit blocks list so that it can be // used in the caller. 
llvm::erase_if(ExitBlocks, - [&](BasicBlock *BB) { return DeadBlocks.count(BB); }); - - // Remove these blocks from their successors. - for (auto *BB : DeadBlocks) - for (BasicBlock *SuccBB : successors(BB)) - SuccBB->removePredecessor(BB, /*DontDeleteUselessPHIs*/ true); + [&](BasicBlock *BB) { return DeadBlockSet.count(BB); }); // Walk from this loop up through its parents removing all of the dead blocks. for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) { for (auto *BB : DeadBlocks) ParentL->getBlocksSet().erase(BB); llvm::erase_if(ParentL->getBlocksVector(), - [&](BasicBlock *BB) { return DeadBlocks.count(BB); }); + [&](BasicBlock *BB) { return DeadBlockSet.count(BB); }); } // Now delete the dead child loops. This raw delete will clear them // recursively. llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) { - if (!DeadBlocks.count(ChildL->getHeader())) + if (!DeadBlockSet.count(ChildL->getHeader())) return false; assert(llvm::all_of(ChildL->blocks(), [&](BasicBlock *ChildBB) { - return DeadBlocks.count(ChildBB); + return DeadBlockSet.count(ChildBB); }) && "If the child loop header is dead all blocks in the child loop must " "be dead as well!"); @@ -1273,19 +1414,20 @@ static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot, return true; }); - // Remove the mappings for the dead blocks. - for (auto *BB : DeadBlocks) + // Remove the loop mappings for the dead blocks and drop all the references + // from these blocks to others to handle cyclic references as we start + // deleting the blocks themselves. + for (auto *BB : DeadBlocks) { + // Check that the dominator tree has already been updated. + assert(!DT.getNode(BB) && "Should already have cleared domtree!"); LI.changeLoopFor(BB, nullptr); - - // Drop all the references from these blocks to others to handle cyclic - // references as we start deleting the blocks themselves. - for (auto *BB : DeadBlocks) BB->dropAllReferences(); + } - for (auto *BB : llvm::reverse(DeadBlocks)) { - DT.eraseNode(BB); + // Actually delete the blocks now that they've been fully unhooked from the + // IR. + for (auto *BB : DeadBlocks) BB->eraseFromParent(); - } } /// Recompute the set of blocks in a loop after unswitching. @@ -1333,14 +1475,15 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, if (LoopBlockSet.empty()) return LoopBlockSet; - // Add the loop header to the set. - LoopBlockSet.insert(Header); - // We found backedges, recurse through them to identify the loop blocks. while (!Worklist.empty()) { BasicBlock *BB = Worklist.pop_back_val(); assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!"); + // No need to walk past the header. + if (BB == Header) + continue; + // Because we know the inner loop structure remains valid we can use the // loop structure to jump immediately across the entire nested loop. // Further, because it is in loop simplified form, we can directly jump @@ -1361,9 +1504,10 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, continue; // Insert all of the blocks (other than those already present) into - // the loop set. The only block we expect to already be in the set is - // the one we used to find this loop as we immediately handle the - // others the first time we encounter the loop. + // the loop set. 
We expect at least the block that led us to find the + // inner loop to be in the block set, but we may also have other loop + // blocks if they were already enqueued as predecessors of some other + // outer loop block. for (auto *InnerBB : InnerL->blocks()) { if (InnerBB == BB) { assert(LoopBlockSet.count(InnerBB) && @@ -1371,9 +1515,7 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, continue; } - bool Inserted = LoopBlockSet.insert(InnerBB).second; - (void)Inserted; - assert(Inserted && "Should only insert an inner loop once!"); + LoopBlockSet.insert(InnerBB); } // Add the preheader to the worklist so we will continue past the @@ -1389,6 +1531,8 @@ static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, Worklist.push_back(Pred); } + assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!"); + // We've found all the blocks participating in the loop, return our completed // set. return LoopBlockSet; @@ -1636,32 +1780,58 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { } while (!DomWorklist.empty()); } -/// Take an invariant branch that has been determined to be safe and worthwhile -/// to unswitch despite being non-trivial to do so and perform the unswitch. -/// -/// This directly updates the CFG to hoist the predicate out of the loop, and -/// clone the necessary parts of the loop to maintain behavior. -/// -/// It also updates both dominator tree and loopinfo based on the unswitching. -/// -/// Once unswitching has been performed it runs the provided callback to report -/// the new loops and no-longer valid loops to the caller. -static bool unswitchInvariantBranch( - Loop &L, BranchInst &BI, DominatorTree &DT, LoopInfo &LI, - AssumptionCache &AC, - function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) { - assert(BI.isConditional() && "Can only unswitch a conditional branch!"); - assert(L.isLoopInvariant(BI.getCondition()) && - "Can only unswitch an invariant branch condition!"); +static bool unswitchNontrivialInvariants( + Loop &L, TerminatorInst &TI, ArrayRef<Value *> Invariants, + DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, + function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB, + ScalarEvolution *SE) { + auto *ParentBB = TI.getParent(); + BranchInst *BI = dyn_cast<BranchInst>(&TI); + SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI); + + // We can only unswitch switches, conditional branches with an invariant + // condition, or combining invariant conditions with an instruction. + assert((SI || BI->isConditional()) && + "Can only unswitch switches and conditional branch!"); + bool FullUnswitch = SI || BI->getCondition() == Invariants[0]; + if (FullUnswitch) + assert(Invariants.size() == 1 && + "Cannot have other invariants with full unswitching!"); + else + assert(isa<Instruction>(BI->getCondition()) && + "Partial unswitching requires an instruction as the condition!"); + + // Constant and BBs tracking the cloned and continuing successor. When we are + // unswitching the entire condition, this can just be trivially chosen to + // unswitch towards `true`. However, when we are unswitching a set of + // invariants combined with `and` or `or`, the combining operation determines + // the best direction to unswitch: we want to unswitch the direction that will + // collapse the branch. 
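The block-set recomputation is at heart a backwards reachability walk; the hunks above adjust two details (the header is now a stop condition rather than pre-seeded, and inner-loop blocks may legitimately be encountered twice). A simplified sketch of the core walk, omitting the patch's nested-loop fast path; names are illustrative:

    #include <cassert>
    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/CFG.h"
    using namespace llvm;

    // Seed with the backedge sources, then walk predecessors. The header is
    // added when it shows up as a predecessor but is never walked past.
    static SmallPtrSet<BasicBlock *, 16>
    walkLoopBlocks(BasicBlock *Header, ArrayRef<BasicBlock *> BackedgeSrcs) {
      SmallPtrSet<BasicBlock *, 16> Blocks;
      SmallVector<BasicBlock *, 16> Worklist;
      for (BasicBlock *BB : BackedgeSrcs)
        if (Blocks.insert(BB).second)
          Worklist.push_back(BB);
      while (!Worklist.empty()) {
        BasicBlock *BB = Worklist.pop_back_val();
        if (BB == Header)
          continue;
        for (BasicBlock *Pred : predecessors(BB))
          if (Blocks.insert(Pred).second)
            Worklist.push_back(Pred);
      }
      assert(Blocks.count(Header) && "Walk must reach the header.");
      return Blocks;
    }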
+ bool Direction = true; + int ClonedSucc = 0; + if (!FullUnswitch) { + if (cast<Instruction>(BI->getCondition())->getOpcode() != Instruction::Or) { + assert(cast<Instruction>(BI->getCondition())->getOpcode() == + Instruction::And && + "Only `or` and `and` instructions can combine invariants being " + "unswitched."); + Direction = false; + ClonedSucc = 1; + } + } - // Constant and BBs tracking the cloned and continuing successor. - const int ClonedSucc = 0; - auto *ParentBB = BI.getParent(); - auto *UnswitchedSuccBB = BI.getSuccessor(ClonedSucc); - auto *ContinueSuccBB = BI.getSuccessor(1 - ClonedSucc); + BasicBlock *RetainedSuccBB = + BI ? BI->getSuccessor(1 - ClonedSucc) : SI->getDefaultDest(); + SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs; + if (BI) + UnswitchedSuccBBs.insert(BI->getSuccessor(ClonedSucc)); + else + for (auto Case : SI->cases()) + if (Case.getCaseSuccessor() != RetainedSuccBB) + UnswitchedSuccBBs.insert(Case.getCaseSuccessor()); - assert(UnswitchedSuccBB != ContinueSuccBB && - "Should not unswitch a branch that always goes to the same place!"); + assert(!UnswitchedSuccBBs.count(RetainedSuccBB) && + "Should not unswitch the same successor we are retaining!"); // The branch should be in this exact loop. Any inner loop's invariant branch // should be handled by unswitching that inner loop. The caller of this @@ -1680,9 +1850,6 @@ static bool unswitchInvariantBranch( if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI())) return false; - SmallPtrSet<BasicBlock *, 4> ExitBlockSet(ExitBlocks.begin(), - ExitBlocks.end()); - // Compute the parent loop now before we start hacking on things. Loop *ParentL = L.getParentLoop(); @@ -1701,27 +1868,31 @@ static bool unswitchInvariantBranch( OuterExitL = NewOuterExitL; } - // If the edge we *aren't* cloning in the unswitch (the continuing edge) - // dominates its target, we can skip cloning the dominated region of the loop - // and its exits. We compute this as a set of nodes to be skipped. - SmallPtrSet<BasicBlock *, 4> SkippedLoopAndExitBlocks; - if (ContinueSuccBB->getUniquePredecessor() || - llvm::all_of(predecessors(ContinueSuccBB), [&](BasicBlock *PredBB) { - return PredBB == ParentBB || DT.dominates(ContinueSuccBB, PredBB); - })) { - visitDomSubTree(DT, ContinueSuccBB, [&](BasicBlock *BB) { - SkippedLoopAndExitBlocks.insert(BB); - return true; - }); + // At this point, we're definitely going to unswitch something so invalidate + // any cached information in ScalarEvolution for the outer most loop + // containing an exit block and all nested loops. + if (SE) { + if (OuterExitL) + SE->forgetLoop(OuterExitL); + else + SE->forgetTopmostLoop(&L); } - // Similarly, if the edge we *are* cloning in the unswitch (the unswitched - // edge) dominates its target, we will end up with dead nodes in the original - // loop and its exits that will need to be deleted. Here, we just retain that - // the property holds and will compute the deleted set later. - bool DeleteUnswitchedSucc = - UnswitchedSuccBB->getUniquePredecessor() || - llvm::all_of(predecessors(UnswitchedSuccBB), [&](BasicBlock *PredBB) { - return PredBB == ParentBB || DT.dominates(UnswitchedSuccBB, PredBB); + + // If the edge from this terminator to a successor dominates that successor, + // store a map from each block in its dominator subtree to it. This lets us + // tell when cloning for a particular successor if a block is dominated by + // some *other* successor with a single data structure. We use this to + // significantly reduce cloning. 
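The predicate guarding the `DominatingSucc` construction deserves spelling out on its own: the edge from `ParentBB` to `SuccBB` dominates `SuccBB` exactly when no other path reaches it. Condensed from the hunk that follows:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/IR/CFG.h"
    #include "llvm/IR/Dominators.h"
    using namespace llvm;

    // True when every path into SuccBB goes through the ParentBB edge:
    // either SuccBB has a unique predecessor, or every other predecessor
    // already lies inside SuccBB's dominated region.
    static bool edgeDominatesTarget(DominatorTree &DT, BasicBlock *ParentBB,
                                    BasicBlock *SuccBB) {
      if (SuccBB->getUniquePredecessor())
        return true;
      return llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
        return PredBB == ParentBB || DT.dominates(SuccBB, PredBB);
      });
    }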
+ SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc; + for (auto *SuccBB : llvm::concat<BasicBlock *const>( + makeArrayRef(RetainedSuccBB), UnswitchedSuccBBs)) + if (SuccBB->getUniquePredecessor() || + llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) { + return PredBB == ParentBB || DT.dominates(SuccBB, PredBB); + })) + visitDomSubTree(DT, SuccBB, [&](BasicBlock *BB) { + DominatingSucc[BB] = SuccBB; + return true; }); // Split the preheader, so that we know that there is a safe place to insert @@ -1732,52 +1903,162 @@ static bool unswitchInvariantBranch( BasicBlock *SplitBB = L.getLoopPreheader(); BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI); - // Keep a mapping for the cloned values. - ValueToValueMapTy VMap; + // Keep track of the dominator tree updates needed. + SmallVector<DominatorTree::UpdateType, 4> DTUpdates; + + // Clone the loop for each unswitched successor. + SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; + VMaps.reserve(UnswitchedSuccBBs.size()); + SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs; + for (auto *SuccBB : UnswitchedSuccBBs) { + VMaps.emplace_back(new ValueToValueMapTy()); + ClonedPHs[SuccBB] = buildClonedLoopBlocks( + L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB, + DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI); + } - // Build the cloned blocks from the loop. - auto *ClonedPH = buildClonedLoopBlocks( - L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB, - ContinueSuccBB, SkippedLoopAndExitBlocks, VMap, AC, DT, LI); + // The stitching of the branched code back together depends on whether we're + // doing full unswitching or not with the exception that we always want to + // nuke the initial terminator placed in the split block. + SplitBB->getTerminator()->eraseFromParent(); + if (FullUnswitch) { + // First we need to unhook the successor relationship as we'll be replacing + // the terminator with a direct branch. This is much simpler for branches + // than switches so we handle those first. + if (BI) { + // Remove the parent as a predecessor of the unswitched successor. + assert(UnswitchedSuccBBs.size() == 1 && + "Only one possible unswitched block for a branch!"); + BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin(); + UnswitchedSuccBB->removePredecessor(ParentBB, + /*DontDeleteUselessPHIs*/ true); + DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB}); + } else { + // Note that we actually want to remove the parent block as a predecessor + // of *every* case successor. The case successor is either unswitched, + // completely eliminating an edge from the parent to that successor, or it + // is a duplicate edge to the retained successor as the retained successor + // is always the default successor and as we'll replace this with a direct + // branch we no longer need the duplicate entries in the PHI nodes. + assert(SI->getDefaultDest() == RetainedSuccBB && + "Not retaining default successor!"); + for (auto &Case : SI->cases()) + Case.getCaseSuccessor()->removePredecessor( + ParentBB, + /*DontDeleteUselessPHIs*/ true); + + // We need to use the set to populate domtree updates as even when there + // are multiple cases pointing at the same successor we only want to + // remove and insert one edge in the domtree. + for (BasicBlock *SuccBB : UnswitchedSuccBBs) + DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB}); + } - // Build the cloned loop structure itself. 
This may be substantially - // different from the original structure due to the simplified CFG. This also - // handles inserting all the cloned blocks into the correct loops. - SmallVector<Loop *, 4> NonChildClonedLoops; - Loop *ClonedL = - buildClonedLoops(L, ExitBlocks, VMap, LI, NonChildClonedLoops); + // Now that we've unhooked the successor relationship, splice the terminator + // from the original loop to the split. + SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI); - // Remove the parent as a predecessor of the unswitched successor. - UnswitchedSuccBB->removePredecessor(ParentBB, /*DontDeleteUselessPHIs*/ true); + // Now wire up the terminator to the preheaders. + if (BI) { + BasicBlock *ClonedPH = ClonedPHs.begin()->second; + BI->setSuccessor(ClonedSucc, ClonedPH); + BI->setSuccessor(1 - ClonedSucc, LoopPH); + DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); + } else { + assert(SI && "Must either be a branch or switch!"); + + // Walk the cases and directly update their successors. + SI->setDefaultDest(LoopPH); + for (auto &Case : SI->cases()) + if (Case.getCaseSuccessor() == RetainedSuccBB) + Case.setSuccessor(LoopPH); + else + Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second); + + // We need to use the set to populate domtree updates as even when there + // are multiple cases pointing at the same successor we only want to + // remove and insert one edge in the domtree. + for (BasicBlock *SuccBB : UnswitchedSuccBBs) + DTUpdates.push_back( + {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second}); + } - // Now splice the branch from the original loop and use it to select between - // the two loops. - SplitBB->getTerminator()->eraseFromParent(); - SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), BI); - BI.setSuccessor(ClonedSucc, ClonedPH); - BI.setSuccessor(1 - ClonedSucc, LoopPH); + // Create a new unconditional branch to the continuing block (as opposed to + // the one cloned). + BranchInst::Create(RetainedSuccBB, ParentBB); + } else { + assert(BI && "Only branches have partial unswitching."); + assert(UnswitchedSuccBBs.size() == 1 && + "Only one possible unswitched block for a branch!"); + BasicBlock *ClonedPH = ClonedPHs.begin()->second; + // When doing a partial unswitch, we have to do a bit more work to build up + // the branch in the split block. + buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction, + *ClonedPH, *LoopPH); + DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); + } - // Create a new unconditional branch to the continuing block (as opposed to - // the one cloned). - BranchInst::Create(ContinueSuccBB, ParentBB); + // Apply the updates accumulated above to get an up-to-date dominator tree. + DT.applyUpdates(DTUpdates); - // Delete anything that was made dead in the original loop due to - // unswitching. - if (DeleteUnswitchedSucc) - deleteDeadBlocksFromLoop(L, UnswitchedSuccBB, ExitBlocks, DT, LI); + // Now that we have an accurate dominator tree, first delete the dead cloned + // blocks so that we can accurately build any cloned loops. It is important to + // not delete the blocks from the original loop yet because we still want to + // reference the original loop to understand the cloned loop's structure. + deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT); + + // Build the cloned loop structure itself. This may be substantially + // different from the original structure due to the simplified CFG. 
This also + // handles inserting all the cloned blocks into the correct loops. + SmallVector<Loop *, 4> NonChildClonedLoops; + for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps) + buildClonedLoops(L, ExitBlocks, *VMap, LI, NonChildClonedLoops); + // Now that our cloned loops have been built, we can update the original loop. + // First we delete the dead blocks from it and then we rebuild the loop + // structure taking these deletions into account. + deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI); SmallVector<Loop *, 4> HoistedLoops; bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops); - // This will have completely invalidated the dominator tree. We can't easily - // bound how much is invalid because in some cases we will refine the - // predecessor set of exit blocks of the loop which can move large unrelated - // regions of code into a new subtree. - // - // FIXME: Eventually, we should use an incremental update utility that - // leverages the existing information in the dominator tree (and potentially - // the nature of the change) to more efficiently update things. - DT.recalculate(*SplitBB->getParent()); + // This transformation has a high risk of corrupting the dominator tree, and + // the below steps to rebuild loop structures will result in hard to debug + // errors in that case so verify that the dominator tree is sane first. + // FIXME: Remove this when the bugs stop showing up and rely on existing + // verification steps. + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + + if (BI) { + // If we unswitched a branch which collapses the condition to a known + // constant we want to replace all the uses of the invariants within both + // the original and cloned blocks. We do this here so that we can use the + // now updated dominator tree to identify which side the users are on. + assert(UnswitchedSuccBBs.size() == 1 && + "Only one possible unswitched block for a branch!"); + BasicBlock *ClonedPH = ClonedPHs.begin()->second; + ConstantInt *UnswitchedReplacement = + Direction ? ConstantInt::getTrue(BI->getContext()) + : ConstantInt::getFalse(BI->getContext()); + ConstantInt *ContinueReplacement = + Direction ? ConstantInt::getFalse(BI->getContext()) + : ConstantInt::getTrue(BI->getContext()); + for (Value *Invariant : Invariants) + for (auto UI = Invariant->use_begin(), UE = Invariant->use_end(); + UI != UE;) { + // Grab the use and walk past it so we can clobber it in the use list. + Use *U = &*UI++; + Instruction *UserI = dyn_cast<Instruction>(U->getUser()); + if (!UserI) + continue; + + // Replace it with the 'continue' side if in the main loop body, and the + // unswitched if in the cloned blocks. + if (DT.dominates(LoopPH, UserI->getParent())) + U->set(ContinueReplacement); + else if (DT.dominates(ClonedPH, UserI->getParent())) + U->set(UnswitchedReplacement); + } + } // We can change which blocks are exit blocks of all the cloned sibling // loops, the current loop, and any parent loops which shared exit blocks @@ -1791,57 +2072,50 @@ static bool unswitchInvariantBranch( // also need to cover any intervening loops. We add all of these loops to // a list and sort them by loop depth to achieve this without updating // unnecessary loops. 
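The use-rewriting loop in the hunk above relies on a subtle iteration pattern: because `U->set` unlinks the use from the invariant's use list, the iterator must be advanced before the store. A condensed version of the same logic, with the loop pulled into an illustrative helper:

    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Rewrite each use of `Invariant` to the constant for whichever side of
    // the unswitch it lives on, decided by which preheader dominates the
    // user. Advance the iterator first: setting the use removes it from
    // the list being walked.
    static void rewriteInvariantUses(Value *Invariant, BasicBlock *LoopPH,
                                     BasicBlock *ClonedPH, Constant *ContinueC,
                                     Constant *UnswitchedC, DominatorTree &DT) {
      for (auto UI = Invariant->use_begin(), UE = Invariant->use_end();
           UI != UE;) {
        Use *U = &*UI++;
        auto *UserI = dyn_cast<Instruction>(U->getUser());
        if (!UserI)
          continue;
        if (DT.dominates(LoopPH, UserI->getParent()))
          U->set(ContinueC);
        else if (DT.dominates(ClonedPH, UserI->getParent()))
          U->set(UnswitchedC);
      }
    }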
- auto UpdateLCSSA = [&](Loop &UpdateL) { + auto UpdateLoop = [&](Loop &UpdateL) { #ifndef NDEBUG - for (Loop *ChildL : UpdateL) + UpdateL.verifyLoop(); + for (Loop *ChildL : UpdateL) { + ChildL->verifyLoop(); assert(ChildL->isRecursivelyLCSSAForm(DT, LI) && "Perturbed a child loop's LCSSA form!"); + } #endif + // First build LCSSA for this loop so that we can preserve it when + // forming dedicated exits. We don't want to perturb some other loop's + // LCSSA while doing that CFG edit. formLCSSA(UpdateL, DT, &LI, nullptr); + + // For loops reached by this loop's original exit blocks we may + // introduced new, non-dedicated exits. At least try to re-form dedicated + // exits for these loops. This may fail if they couldn't have dedicated + // exits to start with. + formDedicatedExitBlocks(&UpdateL, &DT, &LI, /*PreserveLCSSA*/ true); }; // For non-child cloned loops and hoisted loops, we just need to update LCSSA // and we can do it in any order as they don't nest relative to each other. - for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) - UpdateLCSSA(*UpdatedL); + // + // Also check if any of the loops we have updated have become top-level loops + // as that will necessitate widening the outer loop scope. + for (Loop *UpdatedL : + llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) { + UpdateLoop(*UpdatedL); + if (!UpdatedL->getParentLoop()) + OuterExitL = nullptr; + } + if (IsStillLoop) { + UpdateLoop(L); + if (!L.getParentLoop()) + OuterExitL = nullptr; + } // If the original loop had exit blocks, walk up through the outer most loop // of those exit blocks to update LCSSA and form updated dedicated exits. - if (OuterExitL != &L) { - SmallVector<Loop *, 4> OuterLoops; - // We start with the cloned loop and the current loop if they are loops and - // move toward OuterExitL. Also, if either the cloned loop or the current - // loop have become top level loops we need to walk all the way out. - if (ClonedL) { - OuterLoops.push_back(ClonedL); - if (!ClonedL->getParentLoop()) - OuterExitL = nullptr; - } - if (IsStillLoop) { - OuterLoops.push_back(&L); - if (!L.getParentLoop()) - OuterExitL = nullptr; - } - // Grab all of the enclosing loops now. + if (OuterExitL != &L) for (Loop *OuterL = ParentL; OuterL != OuterExitL; OuterL = OuterL->getParentLoop()) - OuterLoops.push_back(OuterL); - - // Finally, update our list of outer loops. This is nicely ordered to work - // inside-out. - for (Loop *OuterL : OuterLoops) { - // First build LCSSA for this loop so that we can preserve it when - // forming dedicated exits. We don't want to perturb some other loop's - // LCSSA while doing that CFG edit. - UpdateLCSSA(*OuterL); - - // For loops reached by this loop's original exit blocks we may - // introduced new, non-dedicated exits. At least try to re-form dedicated - // exits for these loops. This may fail if they couldn't have dedicated - // exits to start with. 
- formDedicatedExitBlocks(OuterL, &DT, &LI, /*PreserveLCSSA*/ true); - } - } + UpdateLoop(*OuterL); #ifndef NDEBUG // Verify the entire loop structure to catch any incorrect updates before we @@ -1856,7 +2130,7 @@ static bool unswitchInvariantBranch( for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) if (UpdatedL->getParentLoop() == ParentL) SibLoops.push_back(UpdatedL); - NonTrivialUnswitchCB(IsStillLoop, SibLoops); + UnswitchCB(IsStillLoop, SibLoops); ++NumBranches; return true; @@ -1895,50 +2169,69 @@ computeDomSubtreeCost(DomTreeNode &N, return Cost; } -/// Unswitch control flow predicated on loop invariant conditions. -/// -/// This first hoists all branches or switches which are trivial (IE, do not -/// require duplicating any part of the loop) out of the loop body. It then -/// looks at other loop invariant control flows and tries to unswitch those as -/// well by cloning the loop if the result is small enough. static bool -unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, - TargetTransformInfo &TTI, bool NonTrivial, - function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) { - assert(L.isRecursivelyLCSSAForm(DT, LI) && - "Loops must be in LCSSA form before unswitching."); - bool Changed = false; +unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, + AssumptionCache &AC, TargetTransformInfo &TTI, + function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB, + ScalarEvolution *SE) { + // Collect all invariant conditions within this loop (as opposed to an inner + // loop which would be handled when visiting that inner loop). + SmallVector<std::pair<TerminatorInst *, TinyPtrVector<Value *>>, 4> + UnswitchCandidates; + for (auto *BB : L.blocks()) { + if (LI.getLoopFor(BB) != &L) + continue; - // Must be in loop simplified form: we need a preheader and dedicated exits. - if (!L.isLoopSimplifyForm()) - return false; + if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { + // We can only consider fully loop-invariant switch conditions as we need + // to completely eliminate the switch after unswitching. + if (!isa<Constant>(SI->getCondition()) && + L.isLoopInvariant(SI->getCondition())) + UnswitchCandidates.push_back({SI, {SI->getCondition()}}); + continue; + } - // Try trivial unswitch first before loop over other basic blocks in the loop. - Changed |= unswitchAllTrivialConditions(L, DT, LI); + auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); + if (!BI || !BI->isConditional() || isa<Constant>(BI->getCondition()) || + BI->getSuccessor(0) == BI->getSuccessor(1)) + continue; - // If we're not doing non-trivial unswitching, we're done. We both accept - // a parameter but also check a local flag that can be used for testing - // a debugging. - if (!NonTrivial && !EnableNonTrivialUnswitch) - return Changed; - - // Collect all remaining invariant branch conditions within this loop (as - // opposed to an inner loop which would be handled when visiting that inner - // loop). 
- SmallVector<TerminatorInst *, 4> UnswitchCandidates; - for (auto *BB : L.blocks()) - if (LI.getLoopFor(BB) == &L) - if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator())) - if (BI->isConditional() && L.isLoopInvariant(BI->getCondition()) && - BI->getSuccessor(0) != BI->getSuccessor(1)) - UnswitchCandidates.push_back(BI); + if (L.isLoopInvariant(BI->getCondition())) { + UnswitchCandidates.push_back({BI, {BI->getCondition()}}); + continue; + } + + Instruction &CondI = *cast<Instruction>(BI->getCondition()); + if (CondI.getOpcode() != Instruction::And && + CondI.getOpcode() != Instruction::Or) + continue; + + TinyPtrVector<Value *> Invariants = + collectHomogenousInstGraphLoopInvariants(L, CondI, LI); + if (Invariants.empty()) + continue; + + UnswitchCandidates.push_back({BI, std::move(Invariants)}); + } // If we didn't find any candidates, we're done. if (UnswitchCandidates.empty()) - return Changed; + return false; - DEBUG(dbgs() << "Considering " << UnswitchCandidates.size() - << " non-trivial loop invariant conditions for unswitching.\n"); + // Check if there are irreducible CFG cycles in this loop. If so, we cannot + // easily unswitch non-trivial edges out of the loop. Doing so might turn the + // irreducible control flow into reducible control flow and introduce new + // loops "out of thin air". If we ever discover important use cases for doing + // this, we can add support to loop unswitch, but it is a lot of complexity + // for what seems little or no real world benefit. + LoopBlocksRPO RPOT(&L); + RPOT.perform(&LI); + if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI)) + return false; + + LLVM_DEBUG( + dbgs() << "Considering " << UnswitchCandidates.size() + << " non-trivial loop invariant conditions for unswitching.\n"); // Given that unswitching these terminators will require duplicating parts of // the loop, so we need to be able to model that cost. Compute the ephemeral @@ -1962,10 +2255,10 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, continue; if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) - return Changed; + return false; if (auto CS = CallSite(&I)) if (CS.isConvergent() || CS.cannotDuplicate()) - return Changed; + return false; Cost += TTI.getUserCost(&I); } @@ -1974,7 +2267,7 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, assert(LoopCost >= 0 && "Must not have negative loop costs!"); BBCostMap[BB] = Cost; } - DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n"); + LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n"); // Now we find the best candidate by searching for the one with the following // properties in order: @@ -1993,8 +2286,8 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, SmallDenseMap<DomTreeNode *, int, 4> DTCostMap; // Given a terminator which might be unswitched, computes the non-duplicated // cost for that terminator. - auto ComputeUnswitchedCost = [&](TerminatorInst *TI) { - BasicBlock &BB = *TI->getParent(); + auto ComputeUnswitchedCost = [&](TerminatorInst &TI, bool FullUnswitch) { + BasicBlock &BB = *TI.getParent(); SmallPtrSet<BasicBlock *, 4> Visited; int Cost = LoopCost; @@ -2003,6 +2296,26 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, if (!Visited.insert(SuccBB).second) continue; + // If this is a partial unswitch candidate, then it must be a conditional + // branch with a condition of either `or` or `and`. 
In that case, one of + // the successors is necessarily duplicated, so don't even try to remove + // its cost. + if (!FullUnswitch) { + auto &BI = cast<BranchInst>(TI); + if (cast<Instruction>(BI.getCondition())->getOpcode() == + Instruction::And) { + if (SuccBB == BI.getSuccessor(1)) + continue; + } else { + assert(cast<Instruction>(BI.getCondition())->getOpcode() == + Instruction::Or && + "Only `and` and `or` conditions can result in a partial " + "unswitch!"); + if (SuccBB == BI.getSuccessor(0)) + continue; + } + } + // This successor's domtree will not need to be duplicated after // unswitching if the edge to the successor dominates it (and thus the // entire tree). This essentially means there is no other path into this @@ -2026,27 +2339,95 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, }; TerminatorInst *BestUnswitchTI = nullptr; int BestUnswitchCost; - for (TerminatorInst *CandidateTI : UnswitchCandidates) { - int CandidateCost = ComputeUnswitchedCost(CandidateTI); - DEBUG(dbgs() << " Computed cost of " << CandidateCost - << " for unswitch candidate: " << *CandidateTI << "\n"); + ArrayRef<Value *> BestUnswitchInvariants; + for (auto &TerminatorAndInvariants : UnswitchCandidates) { + TerminatorInst &TI = *TerminatorAndInvariants.first; + ArrayRef<Value *> Invariants = TerminatorAndInvariants.second; + BranchInst *BI = dyn_cast<BranchInst>(&TI); + int CandidateCost = ComputeUnswitchedCost( + TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 && + Invariants[0] == BI->getCondition())); + LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost + << " for unswitch candidate: " << TI << "\n"); if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) { - BestUnswitchTI = CandidateTI; + BestUnswitchTI = &TI; BestUnswitchCost = CandidateCost; + BestUnswitchInvariants = Invariants; } } - if (BestUnswitchCost < UnswitchThreshold) { - DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = " - << BestUnswitchCost << ") branch: " << *BestUnswitchTI - << "\n"); - Changed |= unswitchInvariantBranch(L, cast<BranchInst>(*BestUnswitchTI), DT, - LI, AC, NonTrivialUnswitchCB); - } else { - DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << BestUnswitchCost - << "\n"); + if (BestUnswitchCost >= UnswitchThreshold) { + LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " + << BestUnswitchCost << "\n"); + return false; } + LLVM_DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = " + << BestUnswitchCost << ") terminator: " << *BestUnswitchTI + << "\n"); + return unswitchNontrivialInvariants( + L, *BestUnswitchTI, BestUnswitchInvariants, DT, LI, AC, UnswitchCB, SE); +} + +/// Unswitch control flow predicated on loop invariant conditions. +/// +/// This first hoists all branches or switches which are trivial (IE, do not +/// require duplicating any part of the loop) out of the loop body. It then +/// looks at other loop invariant control flows and tries to unswitch those as +/// well by cloning the loop if the result is small enough. +/// +/// The `DT`, `LI`, `AC`, `TTI` parameters are required analyses that are also +/// updated based on the unswitch. +/// +/// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is +/// true, we will attempt to do non-trivial unswitching as well as trivial +/// unswitching. +/// +/// The `UnswitchCB` callback provided will be run after unswitching is +/// complete, with the first parameter set to `true` if the provided loop +/// remains a loop, and a list of new sibling loops created. 
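The asymmetry in the partial-unswitch cost model above can be captured in one rule: for an `and` condition we unswitch toward false, so the false successor stays reachable from both loop copies; for an `or`, the true successor does. A sketch of that rule with an illustrative helper name:

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Which successor of a partially unswitched conditional branch remains
    // duplicated, and so earns no cost discount: successor 1 for `and`
    // conditions, successor 0 for `or`.
    static bool successorRemainsDuplicated(BranchInst &BI, BasicBlock *SuccBB) {
      auto *CondI = cast<Instruction>(BI.getCondition());
      unsigned Kept = CondI->getOpcode() == Instruction::And ? 1 : 0;
      return SuccBB == BI.getSuccessor(Kept);
    }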
+/// +/// If `SE` is non-null, we will update that analysis based on the unswitching +/// done. +static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, + AssumptionCache &AC, TargetTransformInfo &TTI, + bool NonTrivial, + function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB, + ScalarEvolution *SE) { + assert(L.isRecursivelyLCSSAForm(DT, LI) && + "Loops must be in LCSSA form before unswitching."); + bool Changed = false; + + // Must be in loop simplified form: we need a preheader and dedicated exits. + if (!L.isLoopSimplifyForm()) + return false; + + // Try trivial unswitch first before loop over other basic blocks in the loop. + if (unswitchAllTrivialConditions(L, DT, LI, SE)) { + // If we unswitched successfully we will want to clean up the loop before + // processing it further so just mark it as unswitched and return. + UnswitchCB(/*CurrentLoopValid*/ true, {}); + return true; + } + + // If we're not doing non-trivial unswitching, we're done. We both accept + // a parameter but also check a local flag that can be used for testing + // a debugging. + if (!NonTrivial && !EnableNonTrivialUnswitch) + return false; + + // For non-trivial unswitching, because it often creates new loops, we rely on + // the pass manager to iterate on the loops rather than trying to immediately + // reach a fixed point. There is no substantial advantage to iterating + // internally, and if any of the new loops are simplified enough to contain + // trivial unswitching we want to prefer those. + + // Try to unswitch the best invariant condition. We prefer this full unswitch to + // a partial unswitch when possible below the threshold. + if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE)) + return true; + + // No other opportunities to unswitch. return Changed; } @@ -2056,16 +2437,18 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, Function &F = *L.getHeader()->getParent(); (void)F; - DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n"); + LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L + << "\n"); // Save the current loop name in a variable so that we can report it even // after it has been deleted. std::string LoopName = L.getName(); - auto NonTrivialUnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid, - ArrayRef<Loop *> NewLoops) { + auto UnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid, + ArrayRef<Loop *> NewLoops) { // If we did a non-trivial unswitch, we have added new (cloned) loops. - U.addSiblingLoops(NewLoops); + if (!NewLoops.empty()) + U.addSiblingLoops(NewLoops); // If the current loop remains valid, we should revisit it to catch any // other unswitch opportunities. Otherwise, we need to mark it as deleted. @@ -2075,15 +2458,13 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, U.markLoopAsDeleted(L, LoopName); }; - if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, - NonTrivialUnswitchCB)) + if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, UnswitchCB, + &AR.SE)) return PreservedAnalyses::all(); -#ifndef NDEBUG // Historically this pass has had issues with the dominator tree so verify it // in asserts builds. 
- AR.DT.verifyDomTree(); -#endif + assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); return getLoopPassPreservedAnalyses(); } @@ -2118,15 +2499,19 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { Function &F = *L->getHeader()->getParent(); - DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n"); + LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L + << "\n"); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - auto NonTrivialUnswitchCB = [&L, &LPM](bool CurrentLoopValid, - ArrayRef<Loop *> NewLoops) { + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + auto *SE = SEWP ? &SEWP->getSE() : nullptr; + + auto UnswitchCB = [&L, &LPM](bool CurrentLoopValid, + ArrayRef<Loop *> NewLoops) { // If we did a non-trivial unswitch, we have added new (cloned) loops. for (auto *NewL : NewLoops) LPM.addLoop(*NewL); @@ -2140,18 +2525,16 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { LPM.markLoopAsDeleted(*L); }; - bool Changed = - unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, NonTrivialUnswitchCB); + bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE); // If anything was unswitched, also clear any cached information about this // loop. LPM.deleteSimpleAnalysisLoop(L); -#ifndef NDEBUG // Historically this pass has had issues with the dominator tree so verify it // in asserts builds. - DT.verifyDomTree(); -#endif + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + return Changed; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 1522170dc3b9..b7b1db76b492 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -39,7 +40,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" -#include "llvm/Transforms/Utils/Local.h" #include <utility> using namespace llvm; diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp index cfb8a062299f..ca6b93e0b4a9 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp @@ -114,7 +114,7 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo, if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) { // We cannot sink a load across a critical edge - there may be stores in // other code paths. 
- if (isa<LoadInst>(Inst)) + if (Inst->mayReadFromMemory()) return false; // We don't want to sink across a critical edge if we don't dominate the @@ -187,11 +187,9 @@ static bool SinkInstruction(Instruction *Inst, if (!SuccToSinkTo) return false; - DEBUG(dbgs() << "Sink" << *Inst << " ("; - Inst->getParent()->printAsOperand(dbgs(), false); - dbgs() << " -> "; - SuccToSinkTo->printAsOperand(dbgs(), false); - dbgs() << ")\n"); + LLVM_DEBUG(dbgs() << "Sink" << *Inst << " ("; + Inst->getParent()->printAsOperand(dbgs(), false); dbgs() << " -> "; + SuccToSinkTo->printAsOperand(dbgs(), false); dbgs() << ")\n"); // Move the instruction. Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt()); @@ -244,7 +242,7 @@ static bool iterativelySinkInstructions(Function &F, DominatorTree &DT, do { MadeChange = false; - DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n"); + LLVM_DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n"); // Process all basic blocks. for (BasicBlock &I : F) MadeChange |= ProcessBlock(I, DT, LI, AA); diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp index 23156d5a4d83..6743e19a7c92 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp @@ -64,7 +64,7 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT, // block. We should consider using actual post-dominance here in the // future. if (UI->getParent() != PhiBB) { - DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n"); + LLVM_DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n"); return false; } @@ -75,7 +75,7 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT, // probably change this to do at least a limited scan of the intervening // instructions and allow handling stores in easily proven safe cases. if (mayBeMemoryDependent(*UI)) { - DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n"); + LLVM_DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n"); return false; } @@ -126,8 +126,8 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT, // If when we directly test whether this is safe it fails, bail. if (UnsafeSet.count(OpI) || ParentBB != PhiBB || mayBeMemoryDependent(*OpI)) { - DEBUG(dbgs() << " Unsafe: can't speculate transitive use: " << *OpI - << "\n"); + LLVM_DEBUG(dbgs() << " Unsafe: can't speculate transitive use: " + << *OpI << "\n"); // Record the stack of instructions which reach this node as unsafe // so we prune subsequent searches. UnsafeSet.insert(OpI); @@ -229,7 +229,7 @@ static bool isSafeAndProfitableToSpeculateAroundPHI( NonFreeMat |= MatCost != TTI.TCC_Free; } if (!NonFreeMat) { - DEBUG(dbgs() << " Free: " << PN << "\n"); + LLVM_DEBUG(dbgs() << " Free: " << PN << "\n"); // No profit in free materialization. return false; } @@ -237,7 +237,7 @@ static bool isSafeAndProfitableToSpeculateAroundPHI( // Now check that the uses of this PHI can actually be speculated, // otherwise we'll still have to materialize the PHI value. if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) { - DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n"); + LLVM_DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n"); return false; } @@ -266,7 +266,7 @@ static bool isSafeAndProfitableToSpeculateAroundPHI( // Assume we will commute the constant to the RHS to be canonical. Idx = 1; - // Get the intrinsic ID if this user is an instrinsic. + // Get the intrinsic ID if this user is an intrinsic. 
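The Sink.cpp change above widens a correctness check from explicit loads to anything that may read memory, such as a readonly call. Condensed into one predicate, this is only one of several checks `IsAcceptableTarget` performs:

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // True when sinking Inst into SuccToSinkTo would move a memory read
    // across a critical edge, which must be refused: stores on the other
    // incoming paths could change what the instruction reads.
    static bool mayReadAcrossCriticalEdge(Instruction *Inst,
                                          BasicBlock *SuccToSinkTo) {
      if (SuccToSinkTo->getUniquePredecessor() == Inst->getParent())
        return false;
      return Inst->mayReadFromMemory();
    }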
@@ -288,9 +288,13 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
       // just bail. We're only interested in cases where folding the incoming
       // constants is at least break-even on all paths.
       if (FoldedCost > MatCost) {
-        DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC << "\n"
-                        " Materializing cost: " << MatCost << "\n"
-                        " Accumulated folded cost: " << FoldedCost << "\n");
+        LLVM_DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC
+                          << "\n"
+                             " Materializing cost: "
+                          << MatCost
+                          << "\n"
+                             " Accumulated folded cost: "
+                          << FoldedCost << "\n");
         return false;
       }
     }
@@ -310,8 +314,8 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
          "less than its materialized cost, "
          "the sum must be as well.");
-  DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
-               << ": " << PN << "\n");
+  LLVM_DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
+                    << ": " << PN << "\n");
   CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
   return true;
 }
@@ -489,9 +493,13 @@ findProfitablePHIs(ArrayRef<PHINode *> PNs,
     // and zero out the cost of everything it depends on.
     int CostSavings = CostSavingsMap.find(PN)->second;
     if (SpecCost > CostSavings) {
-      DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN << "\n"
-                      " Cost savings: " << CostSavings << "\n"
-                      " Speculation cost: " << SpecCost << "\n");
+      LLVM_DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN
+                        << "\n"
+                           " Cost savings: "
+                        << CostSavings
+                        << "\n"
+                           " Speculation cost: "
+                        << SpecCost << "\n");
       continue;
     }
@@ -545,7 +553,7 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
                           SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
                           SmallSetVector<BasicBlock *, 16> &PredSet,
                           DominatorTree &DT) {
-  DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
+  LLVM_DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
   NumPHIsSpeculated += SpecPNs.size();
   // Split any critical edges so that we have a block to hoist into.
@@ -558,8 +566,8 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
             CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
     if (NewPredBB) {
       ++NumEdgesSplit;
-      DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
-                   << "\n");
+      LLVM_DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
+                        << "\n");
       SpecPreds.push_back(NewPredBB);
     } else {
       assert(PredBB->getSingleSuccessor() == ParentBB &&
@@ -593,14 +601,15 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
   int NumSpecInsts = SpecList.size() * SpecPreds.size();
   int NumRedundantInsts = NumSpecInsts - SpecList.size();
-  DEBUG(dbgs() << " Inserting " << NumSpecInsts << " speculated instructions, "
-               << NumRedundantInsts << " redundancies\n");
+  LLVM_DEBUG(dbgs() << " Inserting " << NumSpecInsts
+                    << " speculated instructions, " << NumRedundantInsts
+                    << " redundancies\n");
   NumSpeculatedInstructions += NumSpecInsts;
   NumNewRedundantInstructions += NumRedundantInsts;
   // Each predecessor is numbered by its index in `SpecPreds`, so for each
   // instruction we speculate, the speculated instruction is stored in that
-  // index of the vector asosciated with the original instruction. We also
+  // index of the vector associated with the original instruction. We also
   // store the incoming values for each predecessor from any PHIs used.
   SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
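The instruction counts logged by the speculatePHIs hunk above follow a simple model: every instruction in SpecList is cloned once per predecessor in SpecPreds, and each clone beyond the first is a redundancy. A standalone sketch of that bookkeeping, with illustrative numbers that are not taken from the patch:

    #include <cstdio>

    int main() {
      // Hypothetical sizes: 3 instructions speculated into 4 predecessors.
      int SpecListSize = 3;
      int NumPreds = 4;
      int NumSpecInsts = SpecListSize * NumPreds;          // 12 clones inserted
      int NumRedundantInsts = NumSpecInsts - SpecListSize; // 9 of them redundant
      std::printf("Inserting %d speculated instructions, %d redundancies\n",
                  NumSpecInsts, NumRedundantInsts);
      return 0;
    }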
@@ -716,7 +725,7 @@ static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
 /// true when at least some speculation occurs.
 static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
                                DominatorTree &DT, TargetTransformInfo &TTI) {
-  DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
+  LLVM_DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
   // Savings in cost from speculating around a PHI node.
   SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
@@ -745,7 +754,7 @@ static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
                        PNs.end());
   // If no PHIs were profitable, skip.
   if (PNs.empty()) {
-    DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
+    LLVM_DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
     return false;
   }
@@ -763,13 +772,13 @@ static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
     // differently.
     if (isa<IndirectBrInst>(PredBB->getTerminator()) ||
         isa<InvokeInst>(PredBB->getTerminator())) {
-      DEBUG(dbgs() << " Invalid: predecessor terminator: " << PredBB->getName()
-                   << "\n");
+      LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: "
+                        << PredBB->getName() << "\n");
       return false;
     }
   }
   if (PredSet.size() < 2) {
-    DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
+    LLVM_DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
     return false;
   }
diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index a7c308b59877..f5e1dd6ed850 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -62,7 +62,7 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/Transforms/Scalar/SpeculativeExecution.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Instructions.h"
@@ -137,6 +137,7 @@ INITIALIZE_PASS_END(SpeculativeExecutionLegacyPass, "speculative-execution",
 void SpeculativeExecutionLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<TargetTransformInfoWrapperPass>();
   AU.addPreserved<GlobalsAAWrapperPass>();
+  AU.setPreservesCFG();
 }
 bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
@@ -151,8 +152,8 @@ namespace llvm {
 bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
   if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
-    DEBUG(dbgs() << "Not running SpeculativeExecution because "
-                    "TTI->hasBranchDivergence() is false.\n");
+    LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because "
+                         "TTI->hasBranchDivergence() is false.\n");
     return false;
   }
@@ -251,7 +252,7 @@ static unsigned ComputeSpeculationCost(const Instruction *I,
 bool SpeculativeExecutionPass::considerHoistingFromTo(
     BasicBlock &FromBlock, BasicBlock &ToBlock) {
-  SmallSet<const Instruction *, 8> NotHoisted;
+  SmallPtrSet<const Instruction *, 8> NotHoisted;
   const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](User *U) {
     for (Value* V : U->operand_values()) {
       if (Instruction *I = dyn_cast<Instruction>(V)) {
@@ -314,6 +315,7 @@ PreservedAnalyses SpeculativeExecutionPass::run(Function &F,
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
   PA.preserve<GlobalsAA>();
+  PA.preserveSet<CFGAnalyses>();
   return PA;
 }
 } // namespace llvm
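The two SpeculativeExecution hunks above declare CFG preservation in both pass managers: AU.setPreservesCFG() for the legacy pass and PA.preserveSet<CFGAnalyses>() for the new pass manager. A minimal sketch of the new-style pattern (the pass and its transform stub are placeholders, not the patch's code):

    #include "llvm/IR/Function.h"
    #include "llvm/IR/PassManager.h"
    using namespace llvm;

    // Placeholder pass: the shape of the return value is the point here.
    struct ExamplePass : PassInfoMixin<ExamplePass> {
      PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
        bool Changed = transform(F);
        if (!Changed)
          return PreservedAnalyses::all();
        PreservedAnalyses PA;
        // The pass only rewrites instructions, so every CFG-based analysis
        // (dominators, loops, post-dominators, ...) remains valid.
        PA.preserveSet<CFGAnalyses>();
        return PA;
      }
      static bool transform(Function &F) { return false; } // stub
    };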
diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index ce40af1223f6..2061db13639a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -61,6 +61,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -80,7 +81,6 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
 #include <limits>
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index b8fb80b6cc26..d650264176aa 100644
--- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -40,6 +40,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <algorithm>
 #include <cassert>
@@ -55,6 +56,12 @@ static const char *const FlowBlockName = "Flow";
 namespace {
+static cl::opt<bool> ForceSkipUniformRegions(
+    "structurizecfg-skip-uniform-regions",
+    cl::Hidden,
+    cl::desc("Force whether the StructurizeCFG pass skips uniform regions"),
+    cl::init(false));
+
 // Definition of the complex types used in this pass.
 using BBValuePair = std::pair<BasicBlock *, Value *>;
@@ -120,7 +127,7 @@ public:
   bool resultIsRememberedBlock() { return ResultIsRemembered; }
 };
-/// @brief Transforms the control flow graph on one single entry/exit region
+/// Transforms the control flow graph on one single entry/exit region
 /// at a time.
 ///
 /// After the transform all "If"/"Then"/"Else" style control flow looks like
@@ -176,6 +183,7 @@ class StructurizeCFG : public RegionPass {
   Function *Func;
   Region *ParentRegion;
+  DivergenceAnalysis *DA;
   DominatorTree *DT;
   LoopInfo *LI;
@@ -196,6 +204,9 @@ class StructurizeCFG : public RegionPass {
   void orderNodes();
+  Loop *getAdjustedLoop(RegionNode *RN);
+  unsigned getAdjustedLoopDepth(RegionNode *RN);
+
   void analyzeLoops(RegionNode *N);
   Value *invert(Value *Condition);
@@ -242,8 +253,11 @@ class StructurizeCFG : public RegionPass {
 public:
   static char ID;
-  explicit StructurizeCFG(bool SkipUniformRegions = false)
-      : RegionPass(ID), SkipUniformRegions(SkipUniformRegions) {
+  explicit StructurizeCFG(bool SkipUniformRegions_ = false)
+      : RegionPass(ID),
+        SkipUniformRegions(SkipUniformRegions_) {
+    if (ForceSkipUniformRegions.getNumOccurrences())
+      SkipUniformRegions = ForceSkipUniformRegions.getValue();
     initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
   }
@@ -278,7 +292,7 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
 INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
                     false, false)
-/// \brief Initialize the types and constants used in the pass
+/// Initialize the types and constants used in the pass
 bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
   LLVMContext &Context = R->getEntry()->getContext();
@@ -290,7 +304,27 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
   return false;
 }
-/// \brief Build up the general order of nodes
+/// Use the exit block to determine the loop if RN is a SubRegion.
+Loop *StructurizeCFG::getAdjustedLoop(RegionNode *RN) {
+  if (RN->isSubRegion()) {
+    Region *SubRegion = RN->getNodeAs<Region>();
+    return LI->getLoopFor(SubRegion->getExit());
+  }
+
+  return LI->getLoopFor(RN->getEntry());
+}
+
+/// Use the exit block to determine the loop depth if RN is a SubRegion.
+unsigned StructurizeCFG::getAdjustedLoopDepth(RegionNode *RN) {
+  if (RN->isSubRegion()) {
+    Region *SubR = RN->getNodeAs<Region>();
+    return LI->getLoopDepth(SubR->getExit());
+  }
+
+  return LI->getLoopDepth(RN->getEntry());
+}
+
+/// Build up the general order of nodes
 void StructurizeCFG::orderNodes() {
   ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
   SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
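The constructor hunk above uses a common cl::opt idiom: getNumOccurrences() distinguishes a flag left at its default from one the user actually passed, so -structurizecfg-skip-uniform-regions only overrides the constructor argument when it appears on the command line. A minimal sketch of the same pattern with placeholder names (not the patch's code):

    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    static cl::opt<bool> ForceFeature(
        "force-my-feature", cl::Hidden,
        cl::desc("Force the feature on or off regardless of the constructor"),
        cl::init(false));

    struct MyPass {
      bool EnableFeature;
      explicit MyPass(bool EnableFeature_ = false) : EnableFeature(EnableFeature_) {
        // Only honor the flag if it appeared on the command line; otherwise
        // keep whatever value the pass's creator requested.
        if (ForceFeature.getNumOccurrences())
          EnableFeature = ForceFeature.getValue();
      }
    };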
@@ -299,16 +333,15 @@ void StructurizeCFG::orderNodes() {
   // to what we want. The only problem with it is that sometimes backedges
   // for outer loops will be visited before backedges for inner loops.
   for (RegionNode *RN : RPOT) {
-    BasicBlock *BB = RN->getEntry();
-    Loop *Loop = LI->getLoopFor(BB);
+    Loop *Loop = getAdjustedLoop(RN);
     ++LoopBlocks[Loop];
   }
   unsigned CurrentLoopDepth = 0;
   Loop *CurrentLoop = nullptr;
   for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
-    BasicBlock *BB = (*I)->getEntry();
-    unsigned LoopDepth = LI->getLoopDepth(BB);
+    RegionNode *RN = cast<RegionNode>(*I);
+    unsigned LoopDepth = getAdjustedLoopDepth(RN);
     if (is_contained(Order, *I))
       continue;
@@ -320,15 +353,14 @@ void StructurizeCFG::orderNodes() {
       auto LoopI = I;
       while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
         LoopI++;
-        BasicBlock *LoopBB = (*LoopI)->getEntry();
-        if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+        if (getAdjustedLoop(cast<RegionNode>(*LoopI)) == CurrentLoop) {
           --BlockCount;
           Order.push_back(*LoopI);
         }
       }
     }
-    CurrentLoop = LI->getLoopFor(BB);
+    CurrentLoop = getAdjustedLoop(RN);
     if (CurrentLoop)
       LoopBlocks[CurrentLoop]--;
@@ -343,7 +375,7 @@
   std::reverse(Order.begin(), Order.end());
 }
-/// \brief Determine the end of the loops
+/// Determine the end of the loops
 void StructurizeCFG::analyzeLoops(RegionNode *N) {
   if (N->isSubRegion()) {
     // Test for exit as back edge
@@ -362,15 +394,16 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
   }
 }
-/// \brief Invert the given condition
+/// Invert the given condition
 Value *StructurizeCFG::invert(Value *Condition) {
   // First: Check if it's a constant
   if (Constant *C = dyn_cast<Constant>(Condition))
     return ConstantExpr::getNot(C);
   // Second: If the condition is already inverted, return the original value
-  if (match(Condition, m_Not(m_Value(Condition))))
-    return Condition;
+  Value *NotCondition;
+  if (match(Condition, m_Not(m_Value(NotCondition))))
+    return NotCondition;
   if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
     // Third: Check all the users for an invert
@@ -394,7 +427,7 @@ Value *StructurizeCFG::invert(Value *Condition) {
   llvm_unreachable("Unhandled condition to invert");
 }
-/// \brief Build the condition for one edge
+/// Build the condition for one edge
 Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
                                       bool Invert) {
   Value *Cond = Invert ? BoolFalse : BoolTrue;
@@ -407,7 +440,7 @@ Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
   return Cond;
 }
-/// \brief Analyze the predecessors of each block and build up predicates
+/// Analyze the predecessors of each block and build up predicates
 void StructurizeCFG::gatherPredicates(RegionNode *N) {
   RegionInfo *RI = ParentRegion->getRegionInfo();
   BasicBlock *BB = N->getEntry();
@@ -465,7 +498,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
   }
 }
-/// \brief Collect various loop and predicate infos
+/// Collect various loop and predicate infos
 void StructurizeCFG::collectInfos() {
   // Reset predicate
   Predicates.clear();
@@ -478,10 +511,10 @@ void StructurizeCFG::collectInfos() {
   Visited.clear();
   for (RegionNode *RN : reverse(Order)) {
-    DEBUG(dbgs() << "Visiting: "
-                 << (RN->isSubRegion() ? "SubRegion with entry: " : "")
-                 << RN->getEntry()->getName() << " Loop Depth: "
-                 << LI->getLoopDepth(RN->getEntry()) << "\n");
+    LLVM_DEBUG(dbgs() << "Visiting: "
"SubRegion with entry: " : "") + << RN->getEntry()->getName() << " Loop Depth: " + << LI->getLoopDepth(RN->getEntry()) << "\n"); // Analyze all the conditions leading to a node gatherPredicates(RN); @@ -494,7 +527,7 @@ void StructurizeCFG::collectInfos() { } } -/// \brief Insert the missing branch conditions +/// Insert the missing branch conditions void StructurizeCFG::insertConditions(bool Loops) { BranchVector &Conds = Loops ? LoopConds : Conditions; Value *Default = Loops ? BoolTrue : BoolFalse; @@ -540,7 +573,7 @@ void StructurizeCFG::insertConditions(bool Loops) { } } -/// \brief Remove all PHI values coming from "From" into "To" and remember +/// Remove all PHI values coming from "From" into "To" and remember /// them in DeletedPhis void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { PhiMap &Map = DeletedPhis[To]; @@ -552,7 +585,7 @@ void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { } } -/// \brief Add a dummy PHI value as soon as we knew the new predecessor +/// Add a dummy PHI value as soon as we knew the new predecessor void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { for (PHINode &Phi : To->phis()) { Value *Undef = UndefValue::get(Phi.getType()); @@ -561,7 +594,7 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { AddedPhis[To].push_back(From); } -/// \brief Add the real PHI value as soon as everything is set up +/// Add the real PHI value as soon as everything is set up void StructurizeCFG::setPhiValues() { SSAUpdater Updater; for (const auto &AddedPhi : AddedPhis) { @@ -601,7 +634,7 @@ void StructurizeCFG::setPhiValues() { assert(DeletedPhis.empty()); } -/// \brief Remove phi values from all successors and then remove the terminator. +/// Remove phi values from all successors and then remove the terminator. void StructurizeCFG::killTerminator(BasicBlock *BB) { TerminatorInst *Term = BB->getTerminator(); if (!Term) @@ -611,10 +644,12 @@ void StructurizeCFG::killTerminator(BasicBlock *BB) { SI != SE; ++SI) delPhiValues(BB, *SI); + if (DA) + DA->removeValue(Term); Term->eraseFromParent(); } -/// \brief Let node exit(s) point to NewExit +/// Let node exit(s) point to NewExit void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit, bool IncludeDominator) { if (Node->isSubRegion()) { @@ -660,7 +695,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit, } } -/// \brief Create a new flow node and update dominator tree and region info +/// Create a new flow node and update dominator tree and region info BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) { LLVMContext &Context = Func->getContext(); BasicBlock *Insert = Order.empty() ? 
   BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
@@ -672,7 +707,7 @@ BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
   return Flow;
 }
-/// \brief Create a new or reuse the previous node as flow node
+/// Create a new or reuse the previous node as flow node
 BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
   BasicBlock *Entry = PrevNode->getEntry();
@@ -691,7 +726,7 @@ BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
   return Flow;
 }
-/// \brief Returns the region exit if possible, otherwise just a new flow node
+/// Returns the region exit if possible, otherwise just a new flow node
 BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
                                         bool ExitUseAllowed) {
   if (!Order.empty() || !ExitUseAllowed)
@@ -703,13 +738,13 @@ BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
   return Exit;
 }
-/// \brief Set the previous node
+/// Set the previous node
 void StructurizeCFG::setPrevNode(BasicBlock *BB) {
   PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) :
              nullptr;
 }
-/// \brief Does BB dominate all the predicates of Node?
+/// Does BB dominate all the predicates of Node?
 bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
   BBPredicates &Preds = Predicates[Node->getEntry()];
   return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
@@ -717,7 +752,7 @@ bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
   });
 }
-/// \brief Can we predict that this node will always be called?
+/// Can we predict that this node will always be called?
 bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
   BBPredicates &Preds = Predicates[Node->getEntry()];
   bool Dominated = false;
@@ -845,7 +880,7 @@ void StructurizeCFG::createFlow() {
 }
 /// Handle a rare case where the disintegrated nodes' instructions
-/// no longer dominate all their uses. Not sure if this is really nessasary
+/// no longer dominate all their uses. Not sure if this is really necessary
 void StructurizeCFG::rebuildSSA() {
   SSAUpdater Updater;
   for (BasicBlock *BB : ParentRegion->blocks())
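The hunk that follows rewrites hasOnlyUniformBranches to walk the region's elements and, for sub-regions, to trust a metadata marker left by earlier runs of the pass instead of DivergenceAnalysis. The marker is just a named metadata kind on the terminator; a minimal sketch of reading and writing it (the helper functions are illustrative, though the kind string matches the patch):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Metadata.h"
    using namespace llvm;

    static bool wasMarkedUniform(BasicBlock &BB) {
      unsigned KindID = BB.getContext().getMDKindID("structurizecfg.uniform");
      Instruction *Term = BB.getTerminator();
      // Presence of the kind is the whole signal; the node carries no operands.
      return Term && Term->getMetadata(KindID) != nullptr;
    }

    static void markUniform(BasicBlock &BB) {
      LLVMContext &Ctx = BB.getContext();
      unsigned KindID = Ctx.getMDKindID("structurizecfg.uniform");
      if (Instruction *Term = BB.getTerminator())
        Term->setMetadata(KindID, MDNode::get(Ctx, {}));
    }

As the in-tree comment admits, metadata can be dropped by unrelated passes, so this marker is a heuristic and should not be load-bearing for correctness.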
@@ -878,30 +913,60 @@ void StructurizeCFG::rebuildSSA() {
   }
 }
-static bool hasOnlyUniformBranches(const Region *R,
+static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
                                    const DivergenceAnalysis &DA) {
-  for (const BasicBlock *BB : R->blocks()) {
-    const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator());
-    if (!Br || !Br->isConditional())
-      continue;
+  for (auto E : R->elements()) {
+    if (!E->isSubRegion()) {
+      auto Br = dyn_cast<BranchInst>(E->getEntry()->getTerminator());
+      if (!Br || !Br->isConditional())
+        continue;
-    if (!DA.isUniform(Br->getCondition()))
-      return false;
-    DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n");
+      if (!DA.isUniform(Br))
+        return false;
+      LLVM_DEBUG(dbgs() << "BB: " << Br->getParent()->getName()
+                        << " has uniform terminator\n");
+    } else {
+      // Explicitly refuse to treat regions as uniform if they have non-uniform
+      // subregions. We cannot rely on DivergenceAnalysis for branches in
+      // subregions because those branches may have been removed and re-created,
+      // so we look for our metadata instead.
+      //
+      // Warning: It would be nice to treat regions as uniform based only on
+      // their direct child basic blocks' terminators, regardless of whether
+      // subregions are uniform or not. However, this requires a very careful
+      // look at SIAnnotateControlFlow to make sure nothing breaks there.
+      for (auto BB : E->getNodeAs<Region>()->blocks()) {
+        auto Br = dyn_cast<BranchInst>(BB->getTerminator());
+        if (!Br || !Br->isConditional())
+          continue;
+
+        if (!Br->getMetadata(UniformMDKindID))
+          return false;
+      }
+    }
   }
   return true;
 }
-/// \brief Run the transformation for each region found
+/// Run the transformation for each region found
 bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
   if (R->isTopLevelRegion())
     return false;
+  DA = nullptr;
+
   if (SkipUniformRegions) {
     // TODO: We could probably be smarter here with how we handle sub-regions.
-    auto &DA = getAnalysis<DivergenceAnalysis>();
-    if (hasOnlyUniformBranches(R, DA)) {
-      DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R << '\n');
+    // We currently rely on the fact that metadata is set by earlier invocations
+    // of the pass on sub-regions, and that this metadata doesn't get lost --
+    // but we shouldn't rely on metadata for correctness!
+    unsigned UniformMDKindID =
+        R->getEntry()->getContext().getMDKindID("structurizecfg.uniform");
+    DA = &getAnalysis<DivergenceAnalysis>();
+
+    if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) {
+      LLVM_DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R
+                        << '\n');
       // Mark all direct child block terminators as having been treated as
       // uniform. To account for a possible future in which non-uniform
@@ -913,7 +978,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
         continue;
       if (Instruction *Term = E->getEntry()->getTerminator())
-        Term->setMetadata("structurizecfg.uniform", MD);
+        Term->setMetadata(UniformMDKindID, MD);
     }
     return false;
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 2a1106b41de2..f8cd6c17a5a6 100644
--- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -87,7 +87,7 @@ STATISTIC(NumEliminated, "Number of tail calls removed");
 STATISTIC(NumRetDuped, "Number of return duplicated");
 STATISTIC(NumAccumAdded, "Number of accumulators introduced");
-/// \brief Scan the specified function for alloca instructions.
+/// Scan the specified function for alloca instructions.
 /// If it contains any dynamic allocas, returns false.
 static bool canTRE(Function &F) {
   // Because of PR962, we don't TRE dynamic allocas.
@@ -302,7 +302,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls,
       if (Visited[CI->getParent()] != ESCAPED) {
         // If the escape point was part way through the block, calls after the
        // escape point wouldn't have been put into DeferredTails.
-        DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
+        LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
         CI->setTailCall();
         Modified = true;
       } else {
@@ -699,8 +699,8 @@ static bool foldReturnAndProcessPred(
     BranchInst *BI = UncondBranchPreds.pop_back_val();
     BasicBlock *Pred = BI->getParent();
     if (CallInst *CI = findTRECandidate(BI, CannotTailCallElimCallsMarkedTail, TTI)){
-      DEBUG(dbgs() << "FOLDING: " << *BB
-                   << "INTO UNCOND BRANCH PRED: " << *Pred);
+      LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
+                        << "INTO UNCOND BRANCH PRED: " << *Pred);
       ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
       // Cleanup: if all predecessors of BB have been eliminated by