Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp | 900 |
1 file changed, 900 insertions, 0 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
new file mode 100644
index 000000000000..9e7cccc88412
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -0,0 +1,900 @@
+//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file transforms calls of the current function (self recursion) followed
+// by a return instruction with a branch to the entry of the function, creating
+// a loop. This pass also implements the following extensions to the basic
+// algorithm:
+//
+// 1. Trivial instructions between the call and return do not prevent the
+//    transformation from taking place, though currently the analysis cannot
+//    support moving any really useful instructions (only dead ones).
+// 2. This pass transforms functions that are prevented from being tail
+//    recursive by an associative and commutative expression to use an
+//    accumulator variable, thus compiling the typical naive factorial or
+//    'fib' implementation into efficient code.
+// 3. TRE is performed if the function returns void, if the return
+//    returns the result returned by the call, or if the function returns a
+//    run-time constant on all exits from the function. It is possible, though
+//    unlikely, that the return returns something else (like constant 0), and
+//    can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
+//    the function return the exact same value.
+// 4. If it can prove that callees do not access their caller stack frame,
+//    they are marked as eligible for tail call elimination (by the code
+//    generator).
+//
+// There are several improvements that could be made:
+//
+// 1. If the function has any alloca instructions, these instructions will be
+//    moved out of the entry block of the function, causing them to be
+//    evaluated each time through the tail recursion. Safely keeping allocas
+//    in the entry block requires analysis to prove that the tail-called
+//    function does not read or write the stack object.
+// 2. Tail recursion is only performed if the call immediately precedes the
+//    return instruction. It's possible that there could be a jump between
+//    the call and the return.
+// 3. There can be intervening operations between the call and the return that
+//    prevent the TRE from occurring. For example, there could be GEP's and
+//    stores to memory that will not be read or written by the call. This
+//    requires some substantial analysis (such as with DSA) to prove safe to
+//    move ahead of the call, but doing so could allow many more TREs to be
+//    performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
+// 4. The algorithm we use to detect if callees access their caller stack
+//    frames is very primitive.
+//
+//===----------------------------------------------------------------------===//
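At its core, the pass rewrites a self-recursive tail call into a jump back to the entry block. A minimal source-level sketch of the effect (hypothetical `use` helper; the actual rewrite happens on LLVM IR):

    void use(int);

    // Before: the recursive call sits in tail position.
    void count_down(int n) {
      if (n == 0)
        return;
      use(n);
      count_down(n - 1);
    }

    // After, schematically: the call becomes a back edge to the old entry.
    void count_down(int n) {
    tailrecurse:
      if (n == 0)
        return;
      use(n);
      n = n - 1;        // the argument PHI takes the call's argument
      goto tailrecurse;
    }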
+
+#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "tailcallelim"
+
+STATISTIC(NumEliminated, "Number of tail calls removed");
+STATISTIC(NumRetDuped, "Number of returns duplicated");
+STATISTIC(NumAccumAdded, "Number of accumulators introduced");
+
+/// Scan the specified function for alloca instructions.
+/// If it contains any dynamic allocas, returns false.
+static bool canTRE(Function &F) {
+  // FIXME: The code generator produces really bad code when an 'escaping
+  // alloca' is changed from being a static alloca to being a dynamic alloca.
+  // Until this is resolved, disable this transformation if that would ever
+  // happen. This bug is PR962.
+  return llvm::all_of(instructions(F), [](Instruction &I) {
+    auto *AI = dyn_cast<AllocaInst>(&I);
+    return !AI || AI->isStaticAlloca();
+  });
+}
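canTRE() only rejects functions containing dynamic allocas; fixed-size allocas in the entry block are fine. A hypothetical example of a function it rejects:

    int consume(char *buf, int n);  // hypothetical callee

    int f(int n) {
      // A variable-sized stack allocation is not a static alloca, so
      // canTRE(F) returns false for the whole function (see PR962).
      char *buf = (char *)__builtin_alloca(n);
      return consume(buf, n);
    }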
+
+namespace {
+struct AllocaDerivedValueTracker {
+  // Start at a root value and walk its use-def chain to mark calls that use
+  // the value or a derived value in AllocaUsers, and places where it may
+  // escape in EscapePoints.
+  void walk(Value *Root) {
+    SmallVector<Use *, 32> Worklist;
+    SmallPtrSet<Use *, 32> Visited;
+
+    auto AddUsesToWorklist = [&](Value *V) {
+      for (auto &U : V->uses()) {
+        if (!Visited.insert(&U).second)
+          continue;
+        Worklist.push_back(&U);
+      }
+    };
+
+    AddUsesToWorklist(Root);
+
+    while (!Worklist.empty()) {
+      Use *U = Worklist.pop_back_val();
+      Instruction *I = cast<Instruction>(U->getUser());
+
+      switch (I->getOpcode()) {
+      case Instruction::Call:
+      case Instruction::Invoke: {
+        auto &CB = cast<CallBase>(*I);
+        // If the alloca-derived argument is passed byval it is not an escape
+        // point, or a use of an alloca. Calling with byval copies the
+        // contents of the alloca into argument registers or stack slots,
+        // which exist beyond the lifetime of the current frame.
+        if (CB.isArgOperand(U) && CB.isByValArgument(CB.getArgOperandNo(U)))
+          continue;
+        bool IsNocapture =
+            CB.isDataOperand(U) && CB.doesNotCapture(CB.getDataOperandNo(U));
+        callUsesLocalStack(CB, IsNocapture);
+        if (IsNocapture) {
+          // If the alloca-derived argument is passed in as nocapture, then
+          // it can't propagate to the call's return. That would be capturing.
+          continue;
+        }
+        break;
+      }
+      case Instruction::Load: {
+        // The result of a load is not alloca-derived (unless an alloca has
+        // otherwise escaped, but this is a local analysis).
+        continue;
+      }
+      case Instruction::Store: {
+        if (U->getOperandNo() == 0)
+          EscapePoints.insert(I);
+        continue; // Stores have no users to analyze.
+      }
+      case Instruction::BitCast:
+      case Instruction::GetElementPtr:
+      case Instruction::PHI:
+      case Instruction::Select:
+      case Instruction::AddrSpaceCast:
+        break;
+      default:
+        EscapePoints.insert(I);
+        break;
+      }
+
+      AddUsesToWorklist(I);
+    }
+  }
+
+  void callUsesLocalStack(CallBase &CB, bool IsNocapture) {
+    // Add it to the list of alloca users.
+    AllocaUsers.insert(&CB);
+
+    // If it's nocapture then it can't capture this alloca.
+    if (IsNocapture)
+      return;
+
+    // If it can write to memory, it can leak the alloca value.
+    if (!CB.onlyReadsMemory())
+      EscapePoints.insert(&CB);
+  }
+
+  SmallPtrSet<Instruction *, 32> AllocaUsers;
+  SmallPtrSet<Instruction *, 32> EscapePoints;
+};
+}
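Roughly, the walk classifies uses of a stack object like this (hypothetical example; `g` stands for an arbitrary external function):

    void g(int *p);
    int *global;

    void f() {
      int local;        // root alloca for the walk
      int *p = &local;  // GEP/bitcast/PHI/select derivations stay tracked
      global = p;       // storing the derived pointer is an EscapePoint
      int v = *p;       // a load's result is no longer alloca-derived
      g(p);             // recorded in AllocaUsers; also an EscapePoint
                        // unless g only reads memory or p is nocapture
      (void)v;
    }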
+
+static bool markTails(Function &F, bool &AllCallsAreTailCalls,
+                      OptimizationRemarkEmitter *ORE) {
+  if (F.callsFunctionThatReturnsTwice())
+    return false;
+  AllCallsAreTailCalls = true;
+
+  // The local stack holds all alloca instructions and all byval arguments.
+  AllocaDerivedValueTracker Tracker;
+  for (Argument &Arg : F.args()) {
+    if (Arg.hasByValAttr())
+      Tracker.walk(&Arg);
+  }
+  for (auto &BB : F) {
+    for (auto &I : BB)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+        Tracker.walk(AI);
+  }
+
+  bool Modified = false;
+
+  // Track whether a block is reachable after an alloca has escaped. Blocks
+  // that contain the escaping instruction will be marked as being visited
+  // without an escaped alloca, since that is how the block began.
+  enum VisitType {
+    UNVISITED,
+    UNESCAPED,
+    ESCAPED
+  };
+  DenseMap<BasicBlock *, VisitType> Visited;
+
+  // We propagate the fact that an alloca has escaped from block to successor.
+  // Visit the blocks that are propagating the escapedness first. To do this,
+  // we maintain two worklists.
+  SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;
+
+  // We may enter a block and visit it thinking that no alloca has escaped
+  // yet, then see an escape point and go back around a loop edge and come
+  // back to the same block twice. Because of this, we defer setting tail on
+  // calls when we first encounter them in a block. Every entry in this list
+  // does not statically use an alloca via use-def chain analysis, but may
+  // find an alloca through other means if the block turns out to be
+  // reachable after an escape point.
+  SmallVector<CallInst *, 32> DeferredTails;
+
+  BasicBlock *BB = &F.getEntryBlock();
+  VisitType Escaped = UNESCAPED;
+  do {
+    for (auto &I : *BB) {
+      if (Tracker.EscapePoints.count(&I))
+        Escaped = ESCAPED;
+
+      CallInst *CI = dyn_cast<CallInst>(&I);
+      // A PseudoProbeInst has the IntrInaccessibleMemOnly tag hence it is
+      // considered accessing memory and will be marked as a tail call if we
+      // don't bail out here.
+      if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I) ||
+          isa<PseudoProbeInst>(&I))
+        continue;
+
+      bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles();
+
+      if (!IsNoTail && CI->doesNotAccessMemory()) {
+        // A call to a readnone function whose arguments are all things
+        // computed outside this function can be marked tail. Even if you
+        // stored the alloca address into a global, a readnone function can't
+        // load the global anyhow.
+        //
+        // Note that this runs whether we know an alloca has escaped or not.
+        // If it has, then we can't trust Tracker.AllocaUsers to be accurate.
+        bool SafeToTail = true;
+        for (auto &Arg : CI->arg_operands()) {
+          if (isa<Constant>(Arg.getUser()))
+            continue;
+          if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
+            if (!A->hasByValAttr())
+              continue;
+          SafeToTail = false;
+          break;
+        }
+        if (SafeToTail) {
+          using namespace ore;
+          ORE->emit([&]() {
+            return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI)
+                   << "marked as tail call candidate (readnone)";
+          });
+          CI->setTailCall();
+          Modified = true;
+          continue;
+        }
+      }
+
+      if (!IsNoTail && Escaped == UNESCAPED &&
+          !Tracker.AllocaUsers.count(CI)) {
+        DeferredTails.push_back(CI);
+      } else {
+        AllCallsAreTailCalls = false;
+      }
+    }
+
+    for (auto *SuccBB : successors(BB)) {
+      auto &State = Visited[SuccBB];
+      if (State < Escaped) {
+        State = Escaped;
+        if (State == ESCAPED)
+          WorklistEscaped.push_back(SuccBB);
+        else
+          WorklistUnescaped.push_back(SuccBB);
+      }
+    }
+
+    if (!WorklistEscaped.empty()) {
+      BB = WorklistEscaped.pop_back_val();
+      Escaped = ESCAPED;
+    } else {
+      BB = nullptr;
+      while (!WorklistUnescaped.empty()) {
+        auto *NextBB = WorklistUnescaped.pop_back_val();
+        if (Visited[NextBB] == UNESCAPED) {
+          BB = NextBB;
+          Escaped = UNESCAPED;
+          break;
+        }
+      }
+    }
+  } while (BB);
+
+  for (CallInst *CI : DeferredTails) {
+    if (Visited[CI->getParent()] != ESCAPED) {
+      // If the escape point was part way through the block, calls after the
+      // escape point wouldn't have been put into DeferredTails.
+      LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
+      CI->setTailCall();
+      Modified = true;
+    } else {
+      AllCallsAreTailCalls = false;
+    }
+  }
+
+  return Modified;
+}
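One consequence of the readnone special case: a call can be marked tail even after a local's address has escaped, as long as the callee provably never touches memory and its arguments are not alloca-derived. A hypothetical illustration:

    void publish(int *p);                           // lets the alloca escape
    double my_sqrt(double) __attribute__((const));  // lowers to readnone

    double f(double x) {
      int local;
      publish(&local);   // 'local' has escaped by this point
      return my_sqrt(x); // argument doesn't involve the alloca and the
                         // callee is readnone: still safe to mark tail
    }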
+
+/// Return true if it is safe to move the specified
+/// instruction from after the call to before the call, assuming that all
+/// instructions between the call and this instruction are movable.
+///
+static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
+  // FIXME: We can move load/store/call/free instructions above the call if
+  // the call does not mod/ref the memory location being processed.
+  if (I->mayHaveSideEffects()) // This also handles volatile loads.
+    return false;
+
+  if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+    // Loads may always be moved above calls without side effects.
+    if (CI->mayHaveSideEffects()) {
+      // Non-volatile loads may be moved above a call with side effects if it
+      // does not write to memory and the load provably won't trap.
+      // Writes to memory only matter if they may alias the pointer
+      // being loaded from.
+      const DataLayout &DL = L->getModule()->getDataLayout();
+      if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
+          !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(),
+                                       L->getAlign(), DL, L))
+        return false;
+    }
+  }
+
+  // Otherwise, if this is a side-effect free instruction, check to make sure
+  // that it does not use the return value of the call. If it doesn't use the
+  // return value of the call, it must only use things that are defined before
+  // the call, or movable instructions between the call and the instruction
+  // itself.
+  return !is_contained(I->operands(), CI);
+}
+
+static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
+  if (!I->isAssociative() || !I->isCommutative())
+    return false;
+
+  assert(I->getNumOperands() == 2 &&
+         "Associative/commutative operations should have 2 args!");
+
+  // Exactly one operand should be the result of the call instruction.
+  if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
+      (I->getOperand(0) != CI && I->getOperand(1) != CI))
+    return false;
+
+  // The only user of this instruction we allow is a single return instruction.
+  if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
+    return false;
+
+  return true;
+}
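Concretely, in a body like the one below the multiply cannot be hoisted above the call because it uses the call's result, but it passes canTransformAccumulatorRecursion (associative, commutative, exactly one call operand, single return user), so it becomes the accumulating instruction:

    long fact(long n) {
      if (n <= 1)
        return 1;
      return n * fact(n - 1); // the 'mul' is the accumulator candidate
    }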
+
+static Instruction *firstNonDbg(BasicBlock::iterator I) {
+  while (isa<DbgInfoIntrinsic>(I))
+    ++I;
+  return &*I;
+}
+
+namespace {
+class TailRecursionEliminator {
+  Function &F;
+  const TargetTransformInfo *TTI;
+  AliasAnalysis *AA;
+  OptimizationRemarkEmitter *ORE;
+  DomTreeUpdater &DTU;
+
+  // The below are shared state we want to have available when eliminating
+  // any calls in the function. These values should be populated by
+  // createTailRecurseLoopHeader the first time we find a call we can
+  // eliminate.
+  BasicBlock *HeaderBB = nullptr;
+  SmallVector<PHINode *, 8> ArgumentPHIs;
+  bool RemovableCallsMustBeMarkedTail = false;
+
+  // PHI node to store our return value.
+  PHINode *RetPN = nullptr;
+
+  // i1 PHI node to track if we have a valid return value stored in RetPN.
+  PHINode *RetKnownPN = nullptr;
+
+  // Vector of select instructions we inserted. These selects use RetKnownPN
+  // to either propagate RetPN or select a new return value.
+  SmallVector<SelectInst *, 8> RetSelects;
+
+  // The below are shared state needed when performing accumulator recursion.
+  // These values should be populated by insertAccumulator the first time we
+  // find an elimination that requires an accumulator.
+
+  // PHI node to store our current accumulated value.
+  PHINode *AccPN = nullptr;
+
+  // The instruction doing the accumulating.
+  Instruction *AccumulatorRecursionInstr = nullptr;
+
+  TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI,
+                          AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
+                          DomTreeUpdater &DTU)
+      : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}
+
+  CallInst *findTRECandidate(BasicBlock *BB,
+                             bool CannotTailCallElimCallsMarkedTail);
+
+  void createTailRecurseLoopHeader(CallInst *CI);
+
+  void insertAccumulator(Instruction *AccRecInstr);
+
+  bool eliminateCall(CallInst *CI);
+
+  void cleanupAndFinalize();
+
+  bool processBlock(BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail);
+
+public:
+  static bool eliminate(Function &F, const TargetTransformInfo *TTI,
+                        AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
+                        DomTreeUpdater &DTU);
+};
+} // namespace
+
+CallInst *TailRecursionEliminator::findTRECandidate(
+    BasicBlock *BB, bool CannotTailCallElimCallsMarkedTail) {
+  Instruction *TI = BB->getTerminator();
+
+  if (&BB->front() == TI) // Make sure there is something before the terminator.
+    return nullptr;
+
+  // Scan backwards from the return, checking to see if there is a tail call
+  // in this block. If so, set CI to it.
+  CallInst *CI = nullptr;
+  BasicBlock::iterator BBI(TI);
+  while (true) {
+    CI = dyn_cast<CallInst>(BBI);
+    if (CI && CI->getCalledFunction() == &F)
+      break;
+
+    if (BBI == BB->begin())
+      return nullptr; // Didn't find a potential tail call.
+    --BBI;
+  }
+
+  // If this call is marked as a tail call, and if there are dynamic allocas
+  // in the function, we cannot perform this optimization.
+  if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+    return nullptr;
+
+  // As a special case, detect code like this:
+  //   double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
+  // and disable this xform in this case, because the code generator will
+  // lower the call to fabs into inline code.
+  if (BB == &F.getEntryBlock() &&
+      firstNonDbg(BB->front().getIterator()) == CI &&
+      firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
+      !TTI->isLoweredToCall(CI->getCalledFunction())) {
+    // A single-block function with just a call and a return. Check that
+    // the arguments match.
+    auto I = CI->arg_begin(), E = CI->arg_end();
+    Function::arg_iterator FI = F.arg_begin(), FE = F.arg_end();
+    for (; I != E && FI != FE; ++I, ++FI)
+      if (*I != &*FI) break;
+    if (I == E && FI == FE)
+      return nullptr;
+  }
+
+  return CI;
+}
+
+void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
+  HeaderBB = &F.getEntryBlock();
+  BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
+  NewEntry->takeName(HeaderBB);
+  HeaderBB->setName("tailrecurse");
+  BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry);
+  BI->setDebugLoc(CI->getDebugLoc());
+
+  // If this function has self recursive calls in the tail position where
+  // some are marked tail and some are not, only transform one flavor or
+  // another. We have to choose whether we move allocas in the entry block
+  // to the new entry block or not, so we can't make a good choice for both.
+  // We make this decision here based on whether the first call we found to
+  // remove is marked tail.
+  // NOTE: We could do slightly better here in the case that the function has
+  // no entry block allocas.
+  RemovableCallsMustBeMarkedTail = CI->isTailCall();
+
+  // If this tail call is marked 'tail' and if there are any allocas in the
+  // entry block, move them up to the new entry block.
+  if (RemovableCallsMustBeMarkedTail)
+    // Move all fixed sized allocas from HeaderBB to NewEntry.
+    for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
+                              NEBI = NewEntry->begin();
+         OEBI != E;)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+        if (isa<ConstantInt>(AI->getArraySize()))
+          AI->moveBefore(&*NEBI);
+
+  // Now that we have created a new block, which jumps to the entry
+  // block, insert a PHI node for each argument of the function.
+  // For now, we initialize each PHI to only have the real arguments
+  // which are passed in.
+  Instruction *InsertPos = &HeaderBB->front();
+  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
+       ++I) {
+    PHINode *PN =
+        PHINode::Create(I->getType(), 2, I->getName() + ".tr", InsertPos);
+    I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
+    PN->addIncoming(&*I, NewEntry);
+    ArgumentPHIs.push_back(PN);
+  }
+
+  // If the function doesn't return void, create the RetPN and RetKnownPN PHI
+  // nodes to track our return value. We initialize RetPN with undef and
+  // RetKnownPN with false since we can't know our return value at function
+  // entry.
+  Type *RetType = F.getReturnType();
+  if (!RetType->isVoidTy()) {
+    Type *BoolType = Type::getInt1Ty(F.getContext());
+    RetPN = PHINode::Create(RetType, 2, "ret.tr", InsertPos);
+    RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr", InsertPos);
+
+    RetPN->addIncoming(UndefValue::get(RetType), NewEntry);
+    RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry);
+  }
+
+  // The entry block was changed from HeaderBB to NewEntry.
+  // The forward DominatorTree needs to be recalculated when the EntryBB is
+  // changed. In this corner-case we recalculate the entire tree.
+  DTU.recalculate(*NewEntry->getParent());
+}
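For a function like `int g(int n)`, the header this creates is shaped roughly as follows (schematic IR, names illustrative; the `%ret.*` PHIs only exist for non-void functions):

    define i32 @g(i32 %n) {
    entry:                    ; the fresh NewEntry block; static allocas may
      br label %tailrecurse   ; be moved here
    tailrecurse:              ; the old entry block, renamed; now a loop header
      %n.tr = phi i32 [ %n, %entry ], [ %n.next, %recurse ]
      %ret.tr = phi i32 [ undef, %entry ], [ ..., %recurse ]
      %ret.known.tr = phi i1 [ false, %entry ], [ ..., %recurse ]
      ...
    }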
+
+void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
+  assert(!AccPN && "Trying to insert multiple accumulators");
+
+  AccumulatorRecursionInstr = AccRecInstr;
+
+  // Start by inserting a new PHI node for the accumulator.
+  pred_iterator PB = pred_begin(HeaderBB), PE = pred_end(HeaderBB);
+  AccPN = PHINode::Create(F.getReturnType(), std::distance(PB, PE) + 1,
+                          "accumulator.tr", &HeaderBB->front());
+
+  // Loop over all of the predecessors of the tail recursion block. For the
+  // real entry into the function we seed the PHI with the identity constant
+  // for the accumulation operation. For any other existing branches to this
+  // block (due to other tail recursions eliminated) the accumulator is not
+  // modified. Because we haven't added the branch in the current block to
+  // HeaderBB yet, it will not show up as a predecessor.
+  for (pred_iterator PI = PB; PI != PE; ++PI) {
+    BasicBlock *P = *PI;
+    if (P == &F.getEntryBlock()) {
+      Constant *Identity = ConstantExpr::getBinOpIdentity(
+          AccRecInstr->getOpcode(), AccRecInstr->getType());
+      AccPN->addIncoming(Identity, P);
+    } else {
+      AccPN->addIncoming(AccPN, P);
+    }
+  }
+
+  ++NumAccumAdded;
+}
+
+bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
+  ReturnInst *Ret = cast<ReturnInst>(CI->getParent()->getTerminator());
+
+  // Ok, we found a potential tail call. We can currently only transform the
+  // tail call if all of the instructions between the call and the return are
+  // movable to above the call itself, leaving the call next to the return.
+  // Check that this is the case now.
+  Instruction *AccRecInstr = nullptr;
+  BasicBlock::iterator BBI(CI);
+  for (++BBI; &*BBI != Ret; ++BBI) {
+    if (canMoveAboveCall(&*BBI, CI, AA))
+      continue;
+
+    // If we can't move the instruction above the call, it might be because
+    // it is an associative and commutative operation that could be
+    // transformed using accumulator recursion elimination. Check to see if
+    // this is the case, and if so, remember which instruction accumulates
+    // for later.
+    if (AccPN || !canTransformAccumulatorRecursion(&*BBI, CI))
+      return false; // We cannot eliminate the tail recursion!
+
+    // Yes, this is accumulator recursion. Remember which instruction
+    // accumulates.
+    AccRecInstr = &*BBI;
+  }
+
+  BasicBlock *BB = Ret->getParent();
+
+  using namespace ore;
+  ORE->emit([&]() {
+    return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI)
+           << "transforming tail recursion into loop";
+  });
+
+  // OK! We can transform this tail call. If this is the first one found,
+  // create the new entry block, allowing us to branch back to the old entry.
+  if (!HeaderBB)
+    createTailRecurseLoopHeader(CI);
+
+  if (RemovableCallsMustBeMarkedTail && !CI->isTailCall())
+    return false;
+
+  // Ok, now that we know we have a pseudo-entry block WITH all of the
+  // required PHI nodes, add entries into the PHI node for the actual
+  // parameters passed into the tail-recursive call.
+  for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+    ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
+
+  if (AccRecInstr) {
+    insertAccumulator(AccRecInstr);
+
+    // Rewrite the accumulator recursion instruction so that it does not use
+    // the result of the call anymore, instead, use the PHI node we just
+    // inserted.
+    AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+  }
+
+  // Update our return value tracking.
+  if (RetPN) {
+    if (Ret->getReturnValue() == CI || AccRecInstr) {
+      // Defer selecting a return value.
+      RetPN->addIncoming(RetPN, BB);
+      RetKnownPN->addIncoming(RetKnownPN, BB);
+    } else {
+      // We found a return value we want to use, insert a select instruction
+      // to select it if we don't already know what our return value will be
+      // and store the result in our return value PHI node.
+      SelectInst *SI = SelectInst::Create(
+          RetKnownPN, RetPN, Ret->getReturnValue(), "current.ret.tr", Ret);
+      RetSelects.push_back(SI);
+
+      RetPN->addIncoming(SI, BB);
+      RetKnownPN->addIncoming(ConstantInt::getTrue(RetKnownPN->getType()), BB);
+    }
+
+    if (AccPN)
+      AccPN->addIncoming(AccRecInstr ? AccRecInstr : AccPN, BB);
+  }
+
+  // Now that all of the PHI nodes are in place, remove the call and
+  // ret instructions, replacing them with an unconditional branch.
+  BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret);
+  NewBI->setDebugLoc(CI->getDebugLoc());
+
+  BB->getInstList().erase(Ret); // Remove return.
+  BB->getInstList().erase(CI);  // Remove call.
+  DTU.applyUpdates({{DominatorTree::Insert, BB, HeaderBB}});
+  ++NumEliminated;
+  return true;
+}
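Putting the pieces together for the fact() example above: the accumulator PHI is seeded with the identity of 'mul' and updated along the back edge. A source-level sketch of the net result (the per-return combining is done by cleanupAndFinalize, defined next):

    long fact(long n) {
      long accumulator = 1;           // getBinOpIdentity for 'mul'
    tailrecurse:
      if (n <= 1)
        return accumulator * 1;       // each return combined with AccPN
      accumulator = accumulator * n;  // AccRecInstr, rewritten to use AccPN
      n = n - 1;
      goto tailrecurse;
    }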
+
+void TailRecursionEliminator::cleanupAndFinalize() {
+  // If we eliminated any tail recursions, it's possible that we inserted
+  // some silly PHI nodes which just merge an initial value (the incoming
+  // operand) with themselves. Check to see if we did and clean up our mess
+  // if so. This occurs when a function passes an argument straight through
+  // to its tail call.
+  for (PHINode *PN : ArgumentPHIs) {
+    // If the PHI Node is a dynamic constant, replace it with the value it is.
+    if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+      PN->replaceAllUsesWith(PNV);
+      PN->eraseFromParent();
+    }
+  }
+
+  if (RetPN) {
+    if (RetSelects.empty()) {
+      // If we didn't insert any select instructions, then we know we didn't
+      // store a return value and we can remove the PHI nodes we inserted.
+      RetPN->dropAllReferences();
+      RetPN->eraseFromParent();
+
+      RetKnownPN->dropAllReferences();
+      RetKnownPN->eraseFromParent();
+
+      if (AccPN) {
+        // We need to insert a copy of our accumulator instruction before any
+        // return in the function, and return its result instead.
+        Instruction *AccRecInstr = AccumulatorRecursionInstr;
+        for (BasicBlock &BB : F) {
+          ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+          if (!RI)
+            continue;
+
+          Instruction *AccRecInstrNew = AccRecInstr->clone();
+          AccRecInstrNew->setName("accumulator.ret.tr");
+          AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
+                                     RI->getOperand(0));
+          AccRecInstrNew->insertBefore(RI);
+          RI->setOperand(0, AccRecInstrNew);
+        }
+      }
+    } else {
+      // We need to insert a select instruction before any return left in the
+      // function to select our stored return value if we have one.
+      for (BasicBlock &BB : F) {
+        ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+        if (!RI)
+          continue;
+
+        SelectInst *SI = SelectInst::Create(
+            RetKnownPN, RetPN, RI->getOperand(0), "current.ret.tr", RI);
+        RetSelects.push_back(SI);
+        RI->setOperand(0, SI);
+      }
+
+      if (AccPN) {
+        // We need to insert a copy of our accumulator instruction before any
+        // of the selects we inserted, and select its result instead.
+        Instruction *AccRecInstr = AccumulatorRecursionInstr;
+        for (SelectInst *SI : RetSelects) {
+          Instruction *AccRecInstrNew = AccRecInstr->clone();
+          AccRecInstrNew->setName("accumulator.ret.tr");
+          AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
+                                     SI->getFalseValue());
+          AccRecInstrNew->insertBefore(SI);
+          SI->setFalseValue(AccRecInstrNew);
+        }
+      }
+    }
+  }
+}
+
+bool TailRecursionEliminator::processBlock(
+    BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail) {
+  Instruction *TI = BB.getTerminator();
+
+  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+    if (BI->isConditional())
+      return false;
+
+    BasicBlock *Succ = BI->getSuccessor(0);
+    ReturnInst *Ret = dyn_cast<ReturnInst>(Succ->getFirstNonPHIOrDbg(true));
+
+    if (!Ret)
+      return false;
+
+    CallInst *CI = findTRECandidate(&BB, CannotTailCallElimCallsMarkedTail);
+
+    if (!CI)
+      return false;
+
+    LLVM_DEBUG(dbgs() << "FOLDING: " << *Succ
+                      << "INTO UNCOND BRANCH PRED: " << BB);
+    FoldReturnIntoUncondBranch(Ret, Succ, &BB, &DTU);
+    ++NumRetDuped;
+
+    // If all predecessors of Succ have been eliminated by
+    // FoldReturnIntoUncondBranch, delete it. It is important to empty it,
+    // because the ret instruction in there is still using a value which
+    // eliminateCall will attempt to remove. This block can only contain
+    // instructions that can't have uses, therefore it is safe to remove.
+    if (pred_empty(Succ))
+      DTU.deleteBB(Succ);
+
+    eliminateCall(CI);
+    return true;
+  } else if (isa<ReturnInst>(TI)) {
+    CallInst *CI = findTRECandidate(&BB, CannotTailCallElimCallsMarkedTail);
+
+    if (CI)
+      return eliminateCall(CI);
+  }
+
+  return false;
+}
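The branch case above handles recursion that funnels into a shared return block: FoldReturnIntoUncondBranch first duplicates the return into the predecessor (counted by NumRetDuped) so the call and the ret become adjacent. Schematic IR, before and after the fold:

    ; before                          ; after
    bb:                               bb:
      %r = call i32 @f(i32 %m)          %r = call i32 @f(i32 %m)
      br label %exit                    ret i32 %r
    exit:
      %p = phi i32 [ %r, %bb ], ...   ; %exit is deleted once it has no
      ret i32 %p                      ; remaining predecessors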
+
+bool TailRecursionEliminator::eliminate(Function &F,
+                                        const TargetTransformInfo *TTI,
+                                        AliasAnalysis *AA,
+                                        OptimizationRemarkEmitter *ORE,
+                                        DomTreeUpdater &DTU) {
+  if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
+    return false;
+
+  bool MadeChange = false;
+  bool AllCallsAreTailCalls = false;
+  MadeChange |= markTails(F, AllCallsAreTailCalls, ORE);
+  if (!AllCallsAreTailCalls)
+    return MadeChange;
+
+  // If this function is a varargs function, we won't be able to PHI the args
+  // right, so don't even try to convert it...
+  if (F.getFunctionType()->isVarArg())
+    return MadeChange;
+
+  // If false, we cannot perform TRE on tail calls marked with the 'tail'
+  // attribute, because doing so would cause the stack size to increase (real
+  // TRE would deallocate variable sized allocas, TRE doesn't).
+  bool CanTRETailMarkedCall = canTRE(F);
+
+  // Change any tail recursive calls to loops.
+  TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);
+
+  for (BasicBlock &BB : F)
+    MadeChange |= TRE.processBlock(BB, !CanTRETailMarkedCall);
+
+  TRE.cleanupAndFinalize();
+
+  return MadeChange;
+}
+
+namespace {
+struct TailCallElim : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  TailCallElim() : FunctionPass(ID) {
+    initializeTailCallElimPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<PostDominatorTreeWrapperPass>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+    auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+    auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+    auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+    // There is no noticeable performance difference here between Lazy and
+    // Eager UpdateStrategy based on some test results. It is feasible to
+    // switch the UpdateStrategy to Lazy if we find it profitable later.
+    DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
+
+    return TailRecursionEliminator::eliminate(
+        F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
+        &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
+  }
+};
+}
+
+char TailCallElim::ID = 0;
+INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
+                    false, false)
+
+// Public interface to the TailCallElimination pass
+FunctionPass *llvm::createTailCallEliminationPass() {
+  return new TailCallElim();
+}
+
+PreservedAnalyses TailCallElimPass::run(Function &F,
+                                        FunctionAnalysisManager &AM) {
+
+  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+  AliasAnalysis &AA = AM.getResult<AAManager>(F);
+  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+  auto *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
+  // There is no noticeable performance difference here between Lazy and
+  // Eager UpdateStrategy based on some test results. It is feasible to
+  // switch the UpdateStrategy to Lazy if we find it profitable later.
+  DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
+  bool Changed = TailRecursionEliminator::eliminate(F, &TTI, &AA, &ORE, DTU);
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserve<GlobalsAA>();
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<PostDominatorTreeAnalysis>();
+  return PA;
+}
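To exercise the pass on its own (assuming a stock LLVM `opt` binary from this era), run it over an IR file with either pass manager; `tailcallelim` is the name registered above:

    opt -passes=tailcallelim -S input.ll -o output.ll   # new pass manager
    opt -tailcallelim -S input.ll -o output.ll          # legacy pass manager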