src - FreeBSD source tree

diff options


context:
space:
mode:

author	Dimitry Andric <dim@FreeBSD.org>	2020-01-22 20:31:01 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2020-01-22 20:31:01 +0000
commit	8bcb0991864975618c09697b1aca10683346d9f0 (patch)
tree	0afab28faa50e5f27698f8dd6c1921fff8d25e39 /contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp
parent	b14637d118e110006a149a79b649c5695e7f419a (diff)
parent	1d5ae1026e831016fc29fd927877c86af904481f (diff)

Notes

Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp')

-rw-r--r--

contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp

640

1 files changed, 530 insertions, 110 deletions

diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 0bc2bcff2ae1..9f93c68e6128 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp

@@ -66,7 +66,7 @@ using namespace llvm;

#define DEBUG_TYPE "loop-fusion"

-STATISTIC(FuseCounter, "Count number of loop fusions performed");

+STATISTIC(FuseCounter, "Loops fused");

STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");

STATISTIC(InvalidPreheader, "Loop has invalid preheader");

STATISTIC(InvalidHeader, "Loop has invalid header");

@@ -79,12 +79,15 @@ STATISTIC(MayThrowException, "Loop may throw an exception");

STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");

STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");

STATISTIC(InvalidDependencies, "Dependencies prevent fusion");

-STATISTIC(InvalidTripCount,

- "Loop does not have invariant backedge taken count");

+STATISTIC(UnknownTripCount, "Loop has unknown trip count");

STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");

-STATISTIC(NonEqualTripCount, "Candidate trip counts are not the same");

-STATISTIC(NonAdjacent, "Candidates are not adjacent");

-STATISTIC(NonEmptyPreheader, "Candidate has a non-empty preheader");

+STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");

+STATISTIC(NonAdjacent, "Loops are not adjacent");

+STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader");

+STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");

+STATISTIC(NonIdenticalGuards, "Candidates have different guards");

+STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block");

+STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block");

enum FusionDependenceAnalysisChoice {

FUSION_DEPENDENCE_ANALYSIS_SCEV,

@@ -110,6 +113,7 @@ static cl::opt<bool>

cl::Hidden, cl::init(false), cl::ZeroOrMore);

#endif

+namespace {

/// This class is used to represent a candidate for loop fusion. When it is

/// constructed, it checks the conditions for loop fusion to ensure that it

/// represents a valid candidate. It caches several parts of a loop that are

@@ -143,6 +147,8 @@ struct FusionCandidate {

SmallVector<Instruction *, 16> MemWrites;

/// Are all of the members of this fusion candidate still valid

bool Valid;

+ /// Guard branch of the loop, if it exists

+ BranchInst *GuardBranch;

/// Dominator and PostDominator trees are needed for the

/// FusionCandidateCompare function, required by FusionCandidateSet to

@@ -151,11 +157,20 @@ struct FusionCandidate {

const DominatorTree *DT;

const PostDominatorTree *PDT;

+ OptimizationRemarkEmitter &ORE;

FusionCandidate(Loop *L, const DominatorTree *DT,

- const PostDominatorTree *PDT)

+ const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE)

: Preheader(L->getLoopPreheader()), Header(L->getHeader()),

ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),

- Latch(L->getLoopLatch()), L(L), Valid(true), DT(DT), PDT(PDT) {

+ Latch(L->getLoopLatch()), L(L), Valid(true), GuardBranch(nullptr),

+ DT(DT), PDT(PDT), ORE(ORE) {

+ // TODO: This is temporary while we fuse both rotated and non-rotated

+ // loops. Once we switch to only fusing rotated loops, the initialization of

+ // GuardBranch can be moved into the initialization list above.

+ if (isRotated())

+ GuardBranch = L->getLoopGuardBranch();

// Walk over all blocks in the loop and check for conditions that may

// prevent fusion. For each block, walk over all instructions and collect

@@ -163,28 +178,28 @@ struct FusionCandidate {

// found, invalidate this object and return.

for (BasicBlock *BB : L->blocks()) {

if (BB->hasAddressTaken()) {

- AddressTakenBB++;

invalidate();

+ reportInvalidCandidate(AddressTakenBB);

return;

}

for (Instruction &I : *BB) {

if (I.mayThrow()) {

- MayThrowException++;

invalidate();

+ reportInvalidCandidate(MayThrowException);

return;

}

if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {

if (SI->isVolatile()) {

- ContainsVolatileAccess++;

invalidate();

+ reportInvalidCandidate(ContainsVolatileAccess);

return;

}

if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {

if (LI->isVolatile()) {

- ContainsVolatileAccess++;

invalidate();

+ reportInvalidCandidate(ContainsVolatileAccess);

return;

}

@@ -214,19 +229,96 @@ struct FusionCandidate {

assert(Latch == L->getLoopLatch() && "Latch is out of sync");

}

+ /// Get the entry block for this fusion candidate.

+ ///

+ /// If this fusion candidate represents a guarded loop, the entry block is the

+ /// loop guard block. If it represents an unguarded loop, the entry block is

+ /// the preheader of the loop.

+ BasicBlock *getEntryBlock() const {

+ if (GuardBranch)

+ return GuardBranch->getParent();

+ else

+ return Preheader;

+ }

+ /// Given a guarded loop, get the successor of the guard that is not in the

+ /// loop.

+ ///

+ /// This method returns the successor of the loop guard that is not located

+ /// within the loop (i.e., the successor of the guard that is not the

+ /// preheader).

+ /// This method is only valid for guarded loops.

+ BasicBlock *getNonLoopBlock() const {

+ assert(GuardBranch && "Only valid on guarded loops.");

+ assert(GuardBranch->isConditional() &&

+ "Expecting guard to be a conditional branch.");

+ return (GuardBranch->getSuccessor(0) == Preheader)

+ ? GuardBranch->getSuccessor(1)

+ : GuardBranch->getSuccessor(0);

+ }

+ bool isRotated() const {

+ assert(L && "Expecting loop to be valid.");

+ assert(Latch && "Expecting latch to be valid.");

+ return L->isLoopExiting(Latch);

+ }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

LLVM_DUMP_METHOD void dump() const {

- dbgs() << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")

+ dbgs() << "\tGuardBranch: "

+ << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"

+ << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")

<< "\n"

<< "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"

<< "\tExitingBB: "

<< (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"

<< "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")

<< "\n"

- << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n";

+ << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"

+ << "\tEntryBlock: "

+ << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")

+ << "\n";

}

#endif

+ /// Determine if a fusion candidate (representing a loop) is eligible for

+ /// fusion. Note that this only checks whether a single loop can be fused - it

+ /// does not check whether it is *legal* to fuse two loops together.

+ bool isEligibleForFusion(ScalarEvolution &SE) const {

+ if (!isValid()) {

+ LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");

+ if (!Preheader)

+ ++InvalidPreheader;

+ if (!Header)

+ ++InvalidHeader;

+ if (!ExitingBlock)

+ ++InvalidExitingBlock;

+ if (!ExitBlock)

+ ++InvalidExitBlock;

+ if (!Latch)

+ ++InvalidLatch;

+ if (L->isInvalid())

+ ++InvalidLoop;

+ return false;

+ }

+ // Require ScalarEvolution to be able to determine a trip count.

+ if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {

+ LLVM_DEBUG(dbgs() << "Loop " << L->getName()

+ << " trip count not computable!\n");

+ return reportInvalidCandidate(UnknownTripCount);

+ }

+ if (!L->isLoopSimplifyForm()) {

+ LLVM_DEBUG(dbgs() << "Loop " << L->getName()

+ << " is not in simplified form!\n");

+ return reportInvalidCandidate(NotSimplifiedForm);

+ }

+ return true;

+ }

private:

// This is only used internally for now, to clear the MemWrites and MemReads

// list and setting Valid to false. I can't envision other uses of this right

@@ -239,17 +331,18 @@ private:

MemReads.clear();

Valid = false;

}

-};

-inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,

- const FusionCandidate &FC) {

- if (FC.isValid())

- OS << FC.Preheader->getName();

- else

- OS << "<Invalid>";

- return OS;

+ bool reportInvalidCandidate(llvm::Statistic &Stat) const {

+ using namespace ore;

+ assert(L && Preheader && "Fusion candidate not initialized properly!");

+ ++Stat;

+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(),

+ L->getStartLoc(), Preheader)

+ << "[" << Preheader->getParent()->getName() << "]: "

+ << "Loop is not a candidate for fusion: " << Stat.getDesc());

+ return false;

+ }

+};

struct FusionCandidateCompare {

/// Comparison functor to sort two Control Flow Equivalent fusion candidates

@@ -260,21 +353,24 @@ struct FusionCandidateCompare {

const FusionCandidate &RHS) const {

const DominatorTree *DT = LHS.DT;

+ BasicBlock *LHSEntryBlock = LHS.getEntryBlock();

+ BasicBlock *RHSEntryBlock = RHS.getEntryBlock();

// Do not save PDT to local variable as it is only used in asserts and thus

// will trigger an unused variable warning if building without asserts.

assert(DT && LHS.PDT && "Expecting valid dominator tree");

// Do this compare first so if LHS == RHS, function returns false.

- if (DT->dominates(RHS.Preheader, LHS.Preheader)) {

+ if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {

// RHS dominates LHS

// Verify LHS post-dominates RHS

- assert(LHS.PDT->dominates(LHS.Preheader, RHS.Preheader));

+ assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));

return false;

}

- if (DT->dominates(LHS.Preheader, RHS.Preheader)) {

+ if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {

// Verify RHS Postdominates LHS

- assert(LHS.PDT->dominates(RHS.Preheader, LHS.Preheader));

+ assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));

return true;

}

@@ -286,7 +382,6 @@ struct FusionCandidateCompare {

}

};

-namespace {

using LoopVector = SmallVector<Loop *, 4>;

// Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance

@@ -301,17 +396,26 @@ using LoopVector = SmallVector<Loop *, 4>;

// keeps the FusionCandidateSet sorted will also simplify the implementation.

using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;

using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;

-} // namespace

-inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,

+#if !defined(NDEBUG)

+static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,

+ const FusionCandidate &FC) {

+ if (FC.isValid())

+ OS << FC.Preheader->getName();

+ else

+ OS << "<Invalid>";

+ return OS;

+static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,

const FusionCandidateSet &CandSet) {

- for (auto IT : CandSet)

- OS << IT << "\n";

+ for (const FusionCandidate &FC : CandSet)

+ OS << FC << '\n';

return OS;

}

-#if !defined(NDEBUG)

static void

printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {

dbgs() << "Fusion Candidates: \n";

@@ -391,16 +495,6 @@ static void printLoopVector(const LoopVector &LV) {

}

#endif

-static void reportLoopFusion(const FusionCandidate &FC0,

- const FusionCandidate &FC1,

- OptimizationRemarkEmitter &ORE) {

- using namespace ore;

- ORE.emit(

- OptimizationRemark(DEBUG_TYPE, "LoopFusion", FC0.Preheader->getParent())

- << "Fused " << NV("Cand1", StringRef(FC0.Preheader->getName()))

- << " with " << NV("Cand2", StringRef(FC1.Preheader->getName())));

struct LoopFuser {

private:

// Sets of control flow equivalent fusion candidates for a given nest level.

@@ -497,53 +591,16 @@ private:

const FusionCandidate &FC1) const {

assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");

- if (DT.dominates(FC0.Preheader, FC1.Preheader))

- return PDT.dominates(FC1.Preheader, FC0.Preheader);

+ BasicBlock *FC0EntryBlock = FC0.getEntryBlock();

+ BasicBlock *FC1EntryBlock = FC1.getEntryBlock();

- if (DT.dominates(FC1.Preheader, FC0.Preheader))

- return PDT.dominates(FC0.Preheader, FC1.Preheader);

+ if (DT.dominates(FC0EntryBlock, FC1EntryBlock))

+ return PDT.dominates(FC1EntryBlock, FC0EntryBlock);

- return false;

- }

- /// Determine if a fusion candidate (representing a loop) is eligible for

- /// fusion. Note that this only checks whether a single loop can be fused - it

- /// does not check whether it is *legal* to fuse two loops together.

- bool eligibleForFusion(const FusionCandidate &FC) const {

- if (!FC.isValid()) {

- LLVM_DEBUG(dbgs() << "FC " << FC << " has invalid CFG requirements!\n");

- if (!FC.Preheader)

- InvalidPreheader++;

- if (!FC.Header)

- InvalidHeader++;

- if (!FC.ExitingBlock)

- InvalidExitingBlock++;

- if (!FC.ExitBlock)

- InvalidExitBlock++;

- if (!FC.Latch)

- InvalidLatch++;

- if (FC.L->isInvalid())

- InvalidLoop++;

+ if (DT.dominates(FC1EntryBlock, FC0EntryBlock))

+ return PDT.dominates(FC0EntryBlock, FC1EntryBlock);

- return false;

- }

- // Require ScalarEvolution to be able to determine a trip count.

- if (!SE.hasLoopInvariantBackedgeTakenCount(FC.L)) {

- LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName()

- << " trip count not computable!\n");

- InvalidTripCount++;

- return false;

- }

- if (!FC.L->isLoopSimplifyForm()) {

- LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName()

- << " is not in simplified form!\n");

- NotSimplifiedForm++;

- return false;

- }

- return true;

+ return false;

}

/// Iterate over all loops in the given loop set and identify the loops that

@@ -551,8 +608,8 @@ private:

/// Flow Equivalent sets, sorted by dominance.

void collectFusionCandidates(const LoopVector &LV) {

for (Loop *L : LV) {

- FusionCandidate CurrCand(L, &DT, &PDT);

- if (!eligibleForFusion(CurrCand))

+ FusionCandidate CurrCand(L, &DT, &PDT, ORE);

+ if (!CurrCand.isEligibleForFusion(SE))

continue;

// Go through each list in FusionCandidates and determine if L is control

@@ -664,31 +721,64 @@ private:

if (!identicalTripCounts(*FC0, *FC1)) {

LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "

"counts. Not fusing.\n");

- NonEqualTripCount++;

+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,

+ NonEqualTripCount);

continue;

}

if (!isAdjacent(*FC0, *FC1)) {

LLVM_DEBUG(dbgs()

<< "Fusion candidates are not adjacent. Not fusing.\n");

- NonAdjacent++;

+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);

continue;

}

- // For now we skip fusing if the second candidate has any instructions

- // in the preheader. This is done because we currently do not have the

- // safety checks to determine if it is save to move the preheader of

- // the second candidate past the body of the first candidate. Once

- // these checks are added, this condition can be removed.

+ // Ensure that FC0 and FC1 have identical guards.

+ // If one (or both) are not guarded, this check is not necessary.

+ if (FC0->GuardBranch && FC1->GuardBranch &&

+ !haveIdenticalGuards(*FC0, *FC1)) {

+ LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "

+ "guards. Not Fusing.\n");

+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,

+ NonIdenticalGuards);

+ continue;

+ }

+ // The following three checks look for empty blocks in FC0 and FC1. If

+ // any of these blocks are non-empty, we do not fuse. This is done

+ // because we currently do not have the safety checks to determine if

+ // it is safe to move the blocks past other blocks in the loop. Once

+ // these checks are added, these conditions can be relaxed.

if (!isEmptyPreheader(*FC1)) {

LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "

"preheader. Not fusing.\n");

- NonEmptyPreheader++;

+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,

+ NonEmptyPreheader);

+ continue;

+ }

+ if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) {

+ LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit "

+ "block. Not fusing.\n");

+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,

+ NonEmptyExitBlock);

+ continue;

+ }

+ if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) {

+ LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard "

+ "block. Not fusing.\n");

+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,

+ NonEmptyGuardBlock);

continue;

}

+ // Check the dependencies across the loops and do not fuse if it would

+ // violate them.

if (!dependencesAllowFusion(*FC0, *FC1)) {

LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");

+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,

+ InvalidDependencies);

continue;

}

@@ -696,9 +786,11 @@ private:

LLVM_DEBUG(dbgs()

<< "\tFusion appears to be "

<< (BeneficialToFuse ? "" : "un") << "profitable!\n");

- if (!BeneficialToFuse)

+ if (!BeneficialToFuse) {

+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,

+ FusionNotBeneficial);

continue;

+ }

// All analysis has completed and has determined that fusion is legal

// and profitable. At this point, start transforming the code and

// perform fusion.

@@ -710,15 +802,14 @@ private:

// Note this needs to be done *before* performFusion because

// performFusion will change the original loops, making it not

// possible to identify them after fusion is complete.

- reportLoopFusion(*FC0, *FC1, ORE);

+ reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter);

- FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT);

+ FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE);

FusedCand.verify();

- assert(eligibleForFusion(FusedCand) &&

+ assert(FusedCand.isEligibleForFusion(SE) &&

"Fused candidate should be eligible for fusion!");

// Notify the loop-depth-tree that these loops are not valid objects

- // anymore.

LDT.removeLoop(FC1->L);

CandidateSet.erase(FC0);

@@ -889,7 +980,7 @@ private:

LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1

<< "\n");

assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());

- assert(DT.dominates(FC0.Preheader, FC1.Preheader));

+ assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));

for (Instruction *WriteL0 : FC0.MemWrites) {

for (Instruction *WriteL1 : FC1.MemWrites)

@@ -939,18 +1030,89 @@ private:

return true;

}

- /// Determine if the exit block of \p FC0 is the preheader of \p FC1. In this

- /// case, there is no code in between the two fusion candidates, thus making

- /// them adjacent.

+ /// Determine if two fusion candidates are adjacent in the CFG.

+ ///

+ /// This method will determine if there are additional basic blocks in the CFG

+ /// between the exit of \p FC0 and the entry of \p FC1.

+ /// If the two candidates are guarded loops, then it checks whether the

+ /// non-loop successor of the \p FC0 guard branch is the entry block of \p

+ /// FC1. If not, then the loops are not adjacent. If the two candidates are

+ /// not guarded loops, then it checks whether the exit block of \p FC0 is the

+ /// preheader of \p FC1.

bool isAdjacent(const FusionCandidate &FC0,

const FusionCandidate &FC1) const {

- return FC0.ExitBlock == FC1.Preheader;

+ // If the successor of the guard branch is FC1, then the loops are adjacent

+ if (FC0.GuardBranch)

+ return FC0.getNonLoopBlock() == FC1.getEntryBlock();

+ else

+ return FC0.ExitBlock == FC1.getEntryBlock();

+ }

+ /// Determine if two fusion candidates have identical guards

+ ///

+ /// This method will determine if two fusion candidates have the same guards.

+ /// The guards are considered the same if:

+ /// 1. The instructions to compute the condition used in the compare are

+ /// identical.

+ /// 2. The successors of the guard have the same flow into/around the loop.

+ /// If the compare instructions are identical, then the first successor of the

+ /// guard must go to the same place (either the preheader of the loop or the

+ /// NonLoopBlock). In other words, the first successor of both loops must

+ /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,

+ /// the NonLoopBlock). The same must be true for the second successor.

+ bool haveIdenticalGuards(const FusionCandidate &FC0,

+ const FusionCandidate &FC1) const {

+ assert(FC0.GuardBranch && FC1.GuardBranch &&

+ "Expecting FC0 and FC1 to be guarded loops.");

+ if (auto FC0CmpInst =

+ dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))

+ if (auto FC1CmpInst =

+ dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))

+ if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))

+ return false;

+ // The compare instructions are identical.

+ // Now make sure the successor of the guards have the same flow into/around

+ // the loop

+ if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)

+ return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);

+ else

+ return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);

+ }

+ /// Check that the guard for \p FC *only* contains the cmp/branch for the

+ /// guard.

+ /// Once we are able to handle intervening code, any code in the guard block

+ /// for FC1 will need to be treated as intervening code and checked whether

+ /// it can safely move around the loops.

+ bool isEmptyGuardBlock(const FusionCandidate &FC) const {

+ assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch.");

+ if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) {

+ auto *GuardBlock = FC.GuardBranch->getParent();

+ // If the generation of the cmp value is in GuardBlock, then the size of

+ // the guard block should be 2 (cmp + branch). If the generation of the

+ // cmp value is in a different block, then the size of the guard block

+ // should only be 1.

+ if (CmpInst->getParent() == GuardBlock)

+ return GuardBlock->size() == 2;

+ else

+ return GuardBlock->size() == 1;

+ }

+ return false;

}

bool isEmptyPreheader(const FusionCandidate &FC) const {

+ assert(FC.Preheader && "Expecting a valid preheader");

return FC.Preheader->size() == 1;

}

+ bool isEmptyExitBlock(const FusionCandidate &FC) const {

+ assert(FC.ExitBlock && "Expecting a valid exit block");

+ return FC.ExitBlock->size() == 1;

+ }

/// Fuse two fusion candidates, creating a new fused loop.

///

/// This method contains the mechanics of fusing two loops, represented by \p

@@ -987,6 +1149,12 @@ private:

LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();

dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););

+ // Fusing guarded loops is handled slightly differently than non-guarded

+ // loops and has been broken out into a separate method instead of trying to

+ // intersperse the logic within a single method.

+ if (FC0.GuardBranch)

+ return fuseGuardedLoops(FC0, FC1);

assert(FC1.Preheader == FC0.ExitBlock);

assert(FC1.Preheader->size() == 1 &&

FC1.Preheader->getSingleSuccessor() == FC1.Header);

@@ -1131,7 +1299,258 @@ private:

SE.verify();

#endif

- FuseCounter++;

+ LLVM_DEBUG(dbgs() << "Fusion done:\n");

+ return FC0.L;

+ }

+ /// Report details on loop fusion opportunities.

+ ///

+ /// This template function can be used to report both successful and missed

+ /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should

+ /// be one of:

+ /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful

+ /// given two valid fusion candidates.

+ /// - OptimizationRemark to report successful fusion of two fusion

+ /// candidates.

+ /// The remarks will be printed using the form:

+ /// <path/filename>:<line number>:<column number>: [<function name>]:

+ /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>

+ template <typename RemarkKind>

+ void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,

+ llvm::Statistic &Stat) {

+ assert(FC0.Preheader && FC1.Preheader &&

+ "Expecting valid fusion candidates");

+ using namespace ore;

+ ++Stat;

+ ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),

+ FC0.Preheader)

+ << "[" << FC0.Preheader->getParent()->getName()

+ << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))

+ << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))

+ << ": " << Stat.getDesc());

+ }

+ /// Fuse two guarded fusion candidates, creating a new fused loop.

+ ///

+ /// Fusing guarded loops is handled much the same way as fusing non-guarded

+ /// loops. The rewiring of the CFG is slightly different though, because of

+ /// the presence of the guards around the loops and the exit blocks after the

+ /// loop body. As such, the new loop is rewired as follows:

+ /// 1. Keep the guard branch from FC0 and use the non-loop block target

+ /// from the FC1 guard branch.

+ /// 2. Remove the exit block from FC0 (this exit block should be empty

+ /// right now).

+ /// 3. Remove the guard branch for FC1

+ /// 4. Remove the preheader for FC1.

+ /// The exit block successor for the latch of FC0 is updated to be the header

+ /// of FC1 and the non-exit block successor of the latch of FC1 is updated to

+ /// be the header of FC0, thus creating the fused loop.

+ Loop *fuseGuardedLoops(const FusionCandidate &FC0,

+ const FusionCandidate &FC1) {

+ assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");

+ BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();

+ BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();

+ BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();

+ BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();

+ assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");

+ SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;

+ ////////////////////////////////////////////////////////////////////////////

+ // Update the Loop Guard

+ ////////////////////////////////////////////////////////////////////////////

+ // The guard for FC0 is updated to guard both FC0 and FC1. This is done by

+ // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.

+ // Thus, one path from the guard goes to the preheader for FC0 (and thus

+ // executes the new fused loop) and the other path goes to the NonLoopBlock

+ // for FC1 (where FC1 guard would have gone if FC1 was not executed).

+ FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);

+ FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock,

+ FC1.Header);

+ // The guard of FC1 is not necessary anymore.

+ FC1.GuardBranch->eraseFromParent();

+ new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);

+ TreeUpdates.emplace_back(DominatorTree::UpdateType(

+ DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));

+ TreeUpdates.emplace_back(DominatorTree::UpdateType(

+ DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));

+ TreeUpdates.emplace_back(DominatorTree::UpdateType(

+ DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));

+ TreeUpdates.emplace_back(DominatorTree::UpdateType(

+ DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));

+ assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&

+ "Expecting guard block to have no predecessors");

+ assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&

+ "Expecting guard block to have no successors");

+ // Remember the phi nodes originally in the header of FC0 in order to rewire

+ // them later. However, this is only necessary if the new loop carried

+ // values might not dominate the exiting branch. While we do not generally

+ // test if this is the case but simply insert intermediate phi nodes, we

+ // need to make sure these intermediate phi nodes have different

+ // predecessors. To this end, we filter the special case where the exiting

+ // block is the latch block of the first loop. Nothing needs to be done

+ // anyway as all loop carried values dominate the latch and thereby also the

+ // exiting branch.

+ // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch

+ // (because the loops are rotated). Thus, nothing will ever be added to

+ // OriginalFC0PHIs.

+ SmallVector<PHINode *, 8> OriginalFC0PHIs;

+ if (FC0.ExitingBlock != FC0.Latch)

+ for (PHINode &PHI : FC0.Header->phis())

+ OriginalFC0PHIs.push_back(&PHI);

+ assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");

+ // Replace incoming blocks for header PHIs first.

+ FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);

+ FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);

+ // The old exiting block of the first loop (FC0) has to jump to the header

+ // of the second as we need to execute the code in the second header block

+ // regardless of the trip count. That is, if the trip count is 0, so the

+ // back edge is never taken, we still have to execute both loop headers,

+ // especially (but not only!) if the second is a do-while style loop.

+ // However, doing so might invalidate the phi nodes of the first loop as

+ // the new values do only need to dominate their latch and not the exiting

+ // predicate. To remedy this potential problem we always introduce phi

+ // nodes in the header of the second loop later that select the loop carried

+ // value, if the second header was reached through an old latch of the

+ // first, or undef otherwise. This is sound as exiting the first implies the

+ // second will exit too, __without__ taking the back-edge (their

+ // trip-counts are equal after all).

+ FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,

+ FC1.Header);

+ TreeUpdates.emplace_back(DominatorTree::UpdateType(

+ DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));

+ TreeUpdates.emplace_back(DominatorTree::UpdateType(

+ DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));

+ // Remove FC0 Exit Block

+ // The exit block for FC0 is no longer needed since control will flow

+ // directly to the header of FC1. Since it is an empty block, it can be

+ // removed at this point.

+ // TODO: In the future, we can handle non-empty exit blocks by merging any

+ // instructions from FC0 exit block into FC1 exit block prior to removing

+ // the block.

+ assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) &&

+ "Expecting exit block to be empty");

+ FC0.ExitBlock->getTerminator()->eraseFromParent();

+ new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);

+ // Remove FC1 Preheader

+ // The pre-header of L1 is not necessary anymore.

+ assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));

+ FC1.Preheader->getTerminator()->eraseFromParent();

+ new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);

+ TreeUpdates.emplace_back(DominatorTree::UpdateType(

+ DominatorTree::Delete, FC1.Preheader, FC1.Header));

+ // Moves the phi nodes from the second to the first loop's header block.

+ while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {

+ if (SE.isSCEVable(PHI->getType()))

+ SE.forgetValue(PHI);

+ if (PHI->hasNUsesOrMore(1))

+ PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());

+ else

+ PHI->eraseFromParent();

+ }

+ // Introduce new phi nodes in the second loop header to ensure

+ // exiting the first and jumping to the header of the second does not break

+ // the SSA property of the phis originally in the first loop. See also the

+ // comment above.

+ Instruction *L1HeaderIP = &FC1.Header->front();

+ for (PHINode *LCPHI : OriginalFC0PHIs) {

+ int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);

+ assert(L1LatchBBIdx >= 0 &&

+ "Expected loop carried value to be rewired at this point!");

+ Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);

+ PHINode *L1HeaderPHI = PHINode::Create(

+ LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);

+ L1HeaderPHI->addIncoming(LCV, FC0.Latch);

+ L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),

+ FC0.ExitingBlock);

+ LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);

+ }

+ // Update the latches

+ // Replace latch terminator destinations.

+ FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);

+ FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);

+ // If FC0.Latch and FC0.ExitingBlock are the same then we have already

+ // performed the updates above.

+ if (FC0.Latch != FC0.ExitingBlock)

+ TreeUpdates.emplace_back(DominatorTree::UpdateType(

+ DominatorTree::Insert, FC0.Latch, FC1.Header));

+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,

+ FC0.Latch, FC0.Header));

+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,

+ FC1.Latch, FC0.Header));

+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,

+ FC1.Latch, FC1.Header));

+ // All done

+ // Apply the updates to the Dominator Tree and cleanup.

+ assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&

+ "FC1GuardBlock has successors!!");

+ assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&

+ "FC1GuardBlock has predecessors!!");

+ // Update DT/PDT

+ DTU.applyUpdates(TreeUpdates);

+ LI.removeBlock(FC1.Preheader);

+ DTU.deleteBB(FC1.Preheader);

+ DTU.deleteBB(FC0.ExitBlock);

+ DTU.flush();

+ // Is there a way to keep SE up-to-date so we don't need to forget the loops

+ // and rebuild the information in subsequent passes of fusion?

+ SE.forgetLoop(FC1.L);

+ SE.forgetLoop(FC0.L);

+ // Merge the loops.

+ SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(),

+ FC1.L->block_end());

+ for (BasicBlock *BB : Blocks) {

+ FC0.L->addBlockEntry(BB);

+ FC1.L->removeBlockFromLoop(BB);

+ if (LI.getLoopFor(BB) != FC1.L)

+ continue;

+ LI.changeLoopFor(BB, FC0.L);

+ }

+ while (!FC1.L->empty()) {

+ const auto &ChildLoopIt = FC1.L->begin();

+ Loop *ChildLoop = *ChildLoopIt;

+ FC1.L->removeChildLoop(ChildLoopIt);

+ FC0.L->addChildLoop(ChildLoop);

+ }

+ // Delete the now empty loop L1.

+ LI.erase(FC1.L);

+#ifndef NDEBUG

+ assert(!verifyFunction(*FC0.Header->getParent(), &errs()));

+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));

+ assert(PDT.verify());

+ LI.verify(DT);

+ SE.verify();

+#endif

LLVM_DEBUG(dbgs() << "Fusion done:\n");

@@ -1177,6 +1596,7 @@ struct LoopFuseLegacy : public FunctionPass {

return LF.fuseLoops(F);

}

};

+} // namespace

PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {

auto &LI = AM.getResult<LoopAnalysis>(F);