| field | value | |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2020-01-17 20:45:01 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2020-01-17 20:45:01 +0000 |
| commit | 706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch) | |
| tree | 4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Transforms/Scalar | |
| parent | 7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff) | |
Diffstat (limited to 'llvm/lib/Transforms/Scalar'): 72 files changed, 2122 insertions, 1568 deletions
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp index 7f7460c5746a7..cc3d3bf7cdbf1 100644 --- a/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" diff --git a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 0e9f03a060611..06deaf3c4f9a9 100644 --- a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -15,6 +15,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" #define AA_NAME "alignment-from-assumptions" #define DEBUG_TYPE AA_NAME #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" diff --git a/llvm/lib/Transforms/Scalar/BDCE.cpp b/llvm/lib/Transforms/Scalar/BDCE.cpp index 9bd387c33e80a..0fa38fa80b178 100644 --- a/llvm/lib/Transforms/Scalar/BDCE.cpp +++ b/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -19,13 +19,14 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "bdce" @@ -101,7 +102,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) { (I.getType()->isIntOrIntVectorTy() && DB.getDemandedBits(&I).isNullValue() && wouldInstructionBeTriviallyDead(&I))) { - salvageDebugInfo(I); + salvageDebugInfoOrMarkUndef(I); Worklist.push_back(&I); I.dropAllReferences(); Changed = true; diff --git a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp index c3fba923104fb..e34c011b1c871 100644 --- a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -59,13 +59,15 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 9f340afbf7c28..5bfece010bec2 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -14,7 +14,7 @@ // cost. If the constant can be folded into the instruction (the cost is // TCC_Free) or the cost is just a simple operation (TCC_BASIC), then we don't // consider it expensive and leave it alone. This is the default behavior and -// the default implementation of getIntImmCost will always return TCC_Free. 
+// the default implementation of getIntImmCostInst will always return TCC_Free. // // If the cost is more than TCC_BASIC, then the integer constant can't be folded // into the instruction and it might be beneficial to hoist the constant. @@ -43,7 +43,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -54,6 +53,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/Casting.h" @@ -61,6 +61,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include <algorithm> #include <cassert> @@ -361,11 +362,11 @@ void ConstantHoistingPass::collectConstantCandidates( // Ask the target about the cost of materializing the constant for the given // instruction and operand index. if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst)) - Cost = TTI->getIntImmCost(IntrInst->getIntrinsicID(), Idx, - ConstInt->getValue(), ConstInt->getType()); + Cost = TTI->getIntImmCostIntrin(IntrInst->getIntrinsicID(), Idx, + ConstInt->getValue(), ConstInt->getType()); else - Cost = TTI->getIntImmCost(Inst->getOpcode(), Idx, ConstInt->getValue(), - ConstInt->getType()); + Cost = TTI->getIntImmCostInst(Inst->getOpcode(), Idx, ConstInt->getValue(), + ConstInt->getType()); // Ignore cheap integer constants. if (Cost > TargetTransformInfo::TCC_Basic) { @@ -415,7 +416,7 @@ void ConstantHoistingPass::collectConstantCandidates( // usually lowered to a load from constant pool. Such operation is unlikely // to be cheaper than compute it by <Base + Offset>, which can be lowered to // an ADD instruction or folded into Load/Store instruction. - int Cost = TTI->getIntImmCost(Instruction::Add, 1, Offset, PtrIntTy); + int Cost = TTI->getIntImmCostInst(Instruction::Add, 1, Offset, PtrIntTy); ConstCandVecType &ExprCandVec = ConstGEPCandMap[BaseGV]; ConstCandMapType::iterator Itr; bool Inserted; @@ -486,9 +487,10 @@ void ConstantHoistingPass::collectConstantCandidates( // Scan all operands. for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { // The cost of materializing the constants (defined in - // `TargetTransformInfo::getIntImmCost`) for instructions which only take - // constant variables is lower than `TargetTransformInfo::TCC_Basic`. So - // it's safe for us to collect constant candidates from all IntrinsicInsts. + // `TargetTransformInfo::getIntImmCostInst`) for instructions which only + // take constant variables is lower than `TargetTransformInfo::TCC_Basic`. + // So it's safe for us to collect constant candidates from all + // IntrinsicInsts. if (canReplaceOperandWithVariable(Inst, Idx) || isa<IntrinsicInst>(Inst)) { collectConstantCandidates(ConstCandMap, Inst, Idx); } @@ -499,9 +501,13 @@ void ConstantHoistingPass::collectConstantCandidates( /// into an instruction itself. void ConstantHoistingPass::collectConstantCandidates(Function &Fn) { ConstCandMapType ConstCandMap; - for (BasicBlock &BB : Fn) + for (BasicBlock &BB : Fn) { + // Ignore unreachable basic blocks. 
+ if (!DT->isReachableFromEntry(&BB)) + continue; for (Instruction &Inst : BB) collectConstantCandidates(ConstCandMap, &Inst); + } } // This helper function is necessary to deal with values that have different @@ -552,7 +558,8 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, unsigned NumUses = 0; bool OptForSize = Entry->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI); + llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI, + PGSOQueryType::IRPass); if (!OptForSize || std::distance(S,E) > 100) { for (auto ConstCand = S; ConstCand != E; ++ConstCand) { NumUses += ConstCand->Uses.size(); @@ -575,7 +582,7 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, for (auto User : ConstCand->Uses) { unsigned Opcode = User.Inst->getOpcode(); unsigned OpndIdx = User.OpndIdx; - Cost += TTI->getIntImmCost(Opcode, OpndIdx, Value, Ty); + Cost += TTI->getIntImmCostInst(Opcode, OpndIdx, Value, Ty); LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n"); for (auto C2 = S; C2 != E; ++C2) { diff --git a/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/llvm/lib/Transforms/Scalar/ConstantProp.cpp index e9e6afe3fdd41..73bf1d521b1d0 100644 --- a/llvm/lib/Transforms/Scalar/ConstantProp.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantProp.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 2ef85268df483..3435bc7f5eaaa 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -194,7 +195,14 @@ static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI, } // All constant incoming values map to the same variable along the incoming - // edges of the phi. The phi is unnecessary. + // edges of the phi. The phi is unnecessary. However, we must drop all + // poison-generating flags to ensure that no poison is propagated to the phi + // location by performing this substitution. + // Warning: If the underlying analysis changes, this may not be enough to + // guarantee that poison is not propagated. + // TODO: We may be able to re-infer flags by re-analyzing the instruction. 
+ if (auto *CommonInst = dyn_cast<Instruction>(CommonValue)) + CommonInst->dropPoisonGeneratingFlags(); P->replaceAllUsesWith(CommonValue); P->eraseFromParent(); ++NumPhiCommon; diff --git a/llvm/lib/Transforms/Scalar/DCE.cpp b/llvm/lib/Transforms/Scalar/DCE.cpp index a79d775aa7f3a..a4b0c8df98f64 100644 --- a/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/llvm/lib/Transforms/Scalar/DCE.cpp @@ -19,12 +19,14 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "dce" @@ -80,6 +82,43 @@ Pass *llvm::createDeadInstEliminationPass() { return new DeadInstElimination(); } +//===--------------------------------------------------------------------===// +// RedundantDbgInstElimination pass implementation +// + +namespace { +struct RedundantDbgInstElimination : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + RedundantDbgInstElimination() : FunctionPass(ID) { + initializeRedundantDbgInstEliminationPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + bool Changed = false; + for (auto &BB : F) + Changed |= RemoveRedundantDbgInstrs(&BB); + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } +}; +} + +char RedundantDbgInstElimination::ID = 0; +INITIALIZE_PASS(RedundantDbgInstElimination, "redundant-dbg-inst-elim", + "Redundant Dbg Instruction Elimination", false, false) + +Pass *llvm::createRedundantDbgInstEliminationPass() { + return new RedundantDbgInstElimination(); +} + +//===--------------------------------------------------------------------===// +// DeadCodeElimination pass implementation +// + static bool DCEInstruction(Instruction *I, SmallSetVector<Instruction *, 16> &WorkList, const TargetLibraryInfo *TLI) { diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 685de82810ede..1ba4aab999e14 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Scalar/DeadStoreElimination.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -48,6 +49,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -99,6 +101,7 @@ static void deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, MemoryDependenceResults &MD, const TargetLibraryInfo &TLI, InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst, SmallSetVector<const Value *, 16> *ValueSet = nullptr) { SmallVector<Instruction*, 32> NowDeadInsts; @@ -112,6 +115,10 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, // Before we touch this instruction, remove it from memdep! 
do { Instruction *DeadInst = NowDeadInsts.pop_back_val(); + // Mark the DeadInst as dead in the list of throwable instructions. + auto It = ThrowableInst.find(DeadInst); + if (It != ThrowableInst.end()) + ThrowableInst[It->first] = false; ++NumFastOther; // Try to preserve debug information attached to the dead instruction. @@ -144,6 +151,9 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, DeadInst->eraseFromParent(); } while (!NowDeadInsts.empty()); *BBI = NewIter; + // Pop dead entries from back of ThrowableInst till we find an alive entry. + while (!ThrowableInst.empty() && !ThrowableInst.back().second) + ThrowableInst.pop_back(); } /// Does this instruction write some memory? This only returns true for things @@ -169,15 +179,18 @@ static bool hasAnalyzableMemoryWrite(Instruction *I, } if (auto CS = CallSite(I)) { if (Function *F = CS.getCalledFunction()) { - StringRef FnName = F->getName(); - if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy)) - return true; - if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy)) - return true; - if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat)) - return true; - if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat)) - return true; + LibFunc LF; + if (TLI.getLibFunc(*F, LF) && TLI.has(LF)) { + switch (LF) { + case LibFunc_strcpy: + case LibFunc_strncpy: + case LibFunc_strcat: + case LibFunc_strncat: + return true; + default: + return false; + } + } } } return false; @@ -656,7 +669,8 @@ static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, static bool handleFree(CallInst *F, AliasAnalysis *AA, MemoryDependenceResults *MD, DominatorTree *DT, const TargetLibraryInfo *TLI, - InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB) { + InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst) { bool MadeChange = false; MemoryLocation Loc = MemoryLocation(F->getOperand(0)); @@ -690,7 +704,8 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA, // DCE instructions only used to calculate that store. BasicBlock::iterator BBI(Dependency); - deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); ++NumFastStores; MadeChange = true; @@ -747,8 +762,8 @@ static void removeAccessedObjects(const MemoryLocation &LoadedLoc, static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, MemoryDependenceResults *MD, const TargetLibraryInfo *TLI, - InstOverlapIntervalsTy &IOL, - OrderedBasicBlock &OBB) { + InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst) { bool MadeChange = false; // Keep track of all of the stack objects that are dead at the end of the @@ -809,7 +824,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, << '\n'); // DCE instructions only used to calculate that store. 
- deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, OBB, + deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst, &DeadStackObjects); ++NumFastStores; MadeChange = true; @@ -821,7 +836,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, if (isInstructionTriviallyDead(&*BBI, TLI)) { LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: " << *&*BBI << '\n'); - deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, OBB, + deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst, &DeadStackObjects); ++NumFastOther; MadeChange = true; @@ -1028,7 +1043,8 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, const DataLayout &DL, const TargetLibraryInfo *TLI, InstOverlapIntervalsTy &IOL, - OrderedBasicBlock &OBB) { + OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst) { // Must be a store instruction. StoreInst *SI = dyn_cast<StoreInst>(Inst); if (!SI) @@ -1044,7 +1060,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst); ++NumRedundantStores; return true; } @@ -1062,7 +1078,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: " << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n'); - deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst); ++NumRedundantStores; return true; } @@ -1077,7 +1093,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, bool MadeChange = false; OrderedBasicBlock OBB(&BB); - Instruction *LastThrowing = nullptr; + MapVector<Instruction *, bool> ThrowableInst; // A map of interval maps representing partially-overwritten value parts. InstOverlapIntervalsTy IOL; @@ -1086,7 +1102,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { // Handle 'free' calls specially. if (CallInst *F = isFreeCall(&*BBI, TLI)) { - MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, OBB); + MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, OBB, ThrowableInst); // Increment BBI after handleFree has potentially deleted instructions. // This ensures we maintain a valid iterator. ++BBI; @@ -1096,7 +1112,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, Instruction *Inst = &*BBI++; if (Inst->mayThrow()) { - LastThrowing = Inst; + ThrowableInst[Inst] = true; continue; } @@ -1105,7 +1121,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, continue; // eliminateNoopStore will update in iterator, if necessary. - if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, OBB)) { + if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, OBB, + ThrowableInst)) { MadeChange = true; continue; } @@ -1148,6 +1165,12 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, if (!DepLoc.Ptr) break; + // Find the last throwable instruction not removed by call to + // deleteDeadInstruction. + Instruction *LastThrowing = nullptr; + if (!ThrowableInst.empty()) + LastThrowing = ThrowableInst.back().first; + // Make sure we don't look past a call which might throw. 
This is an // issue because MemoryDependenceAnalysis works in the wrong direction: // it finds instructions which dominate the current instruction, rather than @@ -1187,7 +1210,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. - deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); ++NumFastStores; MadeChange = true; @@ -1269,8 +1293,10 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, OBB.replaceInstruction(DepWrite, SI); // Delete the old stores and now-dead instructions that feed them. - deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, OBB); - deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); + deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); MadeChange = true; // We erased DepWrite and Inst (Loc); start over. @@ -1305,7 +1331,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, // If this block ends in a return, unwind, or unreachable, all allocas are // dead at its end, which means stores to them are also dead. if (BB.getTerminator()->getNumSuccessors() == 0) - MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, OBB); + MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, OBB, ThrowableInst); return MadeChange; } diff --git a/llvm/lib/Transforms/Scalar/DivRemPairs.cpp b/llvm/lib/Transforms/Scalar/DivRemPairs.cpp index 934853507478b..132dfc8f6da10 100644 --- a/llvm/lib/Transforms/Scalar/DivRemPairs.cpp +++ b/llvm/lib/Transforms/Scalar/DivRemPairs.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index ce540683dae26..40c1ba88354f1 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -45,6 +44,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/AtomicOrdering.h" @@ -55,6 +55,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <deque> #include <memory> @@ -906,8 +907,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); continue; } - if (!salvageDebugInfo(*Inst)) - replaceDbgUsesWithUndef(Inst); + + salvageDebugInfoOrMarkUndef(*Inst); removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; diff --git a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index e6abf1ceb026d..72512430b3662 100644 --- a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -13,6 +13,7 @@ #include 
"llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp index 4d2eac0451dfc..af223cc837f2c 100644 --- a/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -11,6 +11,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #define DEBUG_TYPE "float2int" #include "llvm/Transforms/Scalar/Float2Int.h" diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 743353eaea225..1e6aab14e7b44 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -64,6 +64,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -112,7 +113,7 @@ static cl::opt<uint32_t> MaxNumDeps( struct llvm::GVN::Expression { uint32_t opcode; - Type *type; + Type *type = nullptr; bool commutative = false; SmallVector<uint32_t, 4> varargs; @@ -173,7 +174,7 @@ struct llvm::gvn::AvailableValue { PointerIntPair<Value *, 2, ValType> Val; /// Offset - The byte offset in Val that is interesting for the load query. - unsigned Offset; + unsigned Offset = 0; static AvailableValue get(Value *V, unsigned Offset = 0) { AvailableValue Res; @@ -237,7 +238,7 @@ struct llvm::gvn::AvailableValue { /// the associated BasicBlock. struct llvm::gvn::AvailableValueInBlock { /// BB - The basic block in question. - BasicBlock *BB; + BasicBlock *BB = nullptr; /// AV - The actual available value AvailableValue AV; @@ -364,6 +365,7 @@ GVN::ValueTable::ValueTable() = default; GVN::ValueTable::ValueTable(const ValueTable &) = default; GVN::ValueTable::ValueTable(ValueTable &&) = default; GVN::ValueTable::~ValueTable() = default; +GVN::ValueTable &GVN::ValueTable::operator=(const GVN::ValueTable &Arg) = default; /// add - Insert a value into the table with a specified value number. void GVN::ValueTable::add(Value *V, uint32_t num) { @@ -1387,6 +1389,59 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); } +static bool impliesEquivalanceIfTrue(CmpInst* Cmp) { + if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_EQ) + return true; + + // Floating point comparisons can be equal, but not equivalent. Cases: + // NaNs for unordered operators + // +0.0 vs 0.0 for all operators + if (Cmp->getPredicate() == CmpInst::Predicate::FCMP_OEQ || + (Cmp->getPredicate() == CmpInst::Predicate::FCMP_UEQ && + Cmp->getFastMathFlags().noNaNs())) { + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + // If we can prove either side non-zero, then equality must imply + // equivalence. + // FIXME: We should do this optimization if 'no signed zeros' is + // applicable via an instruction-level fast-math-flag or some other + // indicator that relaxed FP semantics are being used. 
+ if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero()) + return true; + if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero()) + return true;; + // TODO: Handle vector floating point constants + } + return false; +} + +static bool impliesEquivalanceIfFalse(CmpInst* Cmp) { + if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_NE) + return true; + + // Floating point comparisons can be equal, but not equivelent. Cases: + // NaNs for unordered operators + // +0.0 vs 0.0 for all operators + if ((Cmp->getPredicate() == CmpInst::Predicate::FCMP_ONE && + Cmp->getFastMathFlags().noNaNs()) || + Cmp->getPredicate() == CmpInst::Predicate::FCMP_UNE) { + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + // If we can prove either side non-zero, then equality must imply + // equivalence. + // FIXME: We should do this optimization if 'no signed zeros' is + // applicable via an instruction-level fast-math-flag or some other + // indicator that relaxed FP semantics are being used. + if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero()) + return true; + if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero()) + return true;; + // TODO: Handle vector floating point constants + } + return false; +} + + static bool hasUsersIn(Value *V, BasicBlock *BB) { for (User *U : V->users()) if (isa<Instruction>(U) && @@ -1451,10 +1506,7 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { // call void @llvm.assume(i1 %cmp) // ret float %load ; will change it to ret float %0 if (auto *CmpI = dyn_cast<CmpInst>(V)) { - if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ || - CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ || - (CmpI->getPredicate() == CmpInst::Predicate::FCMP_UEQ && - CmpI->getFastMathFlags().noNaNs())) { + if (impliesEquivalanceIfTrue(CmpI)) { Value *CmpLHS = CmpI->getOperand(0); Value *CmpRHS = CmpI->getOperand(1); // Heuristically pick the better replacement -- the choice of heuristic @@ -1481,12 +1533,6 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS)) return Changed; - // +0.0 and -0.0 compare equal, but do not imply equivalence. Unless we - // can prove equivalence, bail. - if (CmpRHS->getType()->isFloatTy() && - (!isa<ConstantFP>(CmpRHS) || cast<ConstantFP>(CmpRHS)->isZero())) - return Changed; - LLVM_DEBUG(dbgs() << "Replacing dominated uses of " << *CmpLHS << " with " << *CmpRHS << " in block " @@ -1875,27 +1921,12 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); // If "A == B" is known true, or "A != B" is known false, then replace - // A with B everywhere in the scope. - if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || - (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) + // A with B everywhere in the scope. For floating point operations, we + // have to be careful since equality does not always imply equivalance. + if ((isKnownTrue && impliesEquivalanceIfTrue(Cmp)) || + (isKnownFalse && impliesEquivalanceIfFalse(Cmp))) Worklist.push_back(std::make_pair(Op0, Op1)); - // Handle the floating point versions of equality comparisons too. - if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) || - (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) { - - // Floating point -0.0 and 0.0 compare equal, so we can only - // propagate values if we know that we have a constant and that - // its value is non-zero. 
- - // FIXME: We should do this optimization if 'no signed zeros' is - // applicable via an instruction-level fast-math-flag or some other - // indicator that relaxed FP semantics are being used. - - if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero()) - Worklist.push_back(std::make_pair(Op0, Op1)); - } - // If "A >= B" is known true, replace "A < B" with false everywhere. CmpInst::Predicate NotPred = Cmp->getInversePredicate(); Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp index c87e41484b132..e1796f6bf05a8 100644 --- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -47,7 +47,6 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/PostDominators.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -65,6 +64,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -72,6 +72,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <iterator> @@ -956,7 +957,8 @@ private: if (MoveAccess && NewMemAcc) { // The definition of this ld/st will not change: ld/st hoisting is // legal when the ld/st is not moved past its current definition. - MSSAUpdater->moveToPlace(NewMemAcc, DestBB, MemorySSA::End); + MSSAUpdater->moveToPlace(NewMemAcc, DestBB, + MemorySSA::BeforeTerminator); } // Replace all other instructions with Repl with memory access NewMemAcc. 
@@ -1067,6 +1069,9 @@ private: ++NI; } + if (MSSA && VerifyMemorySSA) + MSSA->verifyMemorySSA(); + NumHoisted += NL + NS + NC + NI; NumRemoved += NR; NumLoadsHoisted += NL; @@ -1168,6 +1173,7 @@ public: AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<MemorySSAWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); } }; diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index 054025755c69a..6d0a4975e2668 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -47,7 +47,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -59,6 +58,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ArrayRecycler.h" @@ -71,6 +71,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/GVNExpression.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstddef> diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp index 2697d78095681..a3eba27a4d90c 100644 --- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -39,7 +39,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/GuardWidening.h" -#include <functional> #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Statistic.h" @@ -53,11 +52,15 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/GuardUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include <functional> using namespace llvm; @@ -66,22 +69,6 @@ using namespace llvm; STATISTIC(GuardsEliminated, "Number of eliminated guards"); STATISTIC(CondBranchEliminated, "Number of eliminated conditional branches"); -static cl::opt<bool> WidenFrequentBranches( - "guard-widening-widen-frequent-branches", cl::Hidden, - cl::desc("Widen conditions of explicit branches into dominating guards in " - "case if their taken frequency exceeds threshold set by " - "guard-widening-frequent-branch-threshold option"), - cl::init(false)); - -static cl::opt<unsigned> FrequentBranchThreshold( - "guard-widening-frequent-branch-threshold", cl::Hidden, - cl::desc("When WidenFrequentBranches is set to true, this option is used " - "to determine which branches are frequently taken. 
The criteria " - "that a branch is taken more often than " - "((FrequentBranchThreshold - 1) / FrequentBranchThreshold), then " - "it is considered frequently taken"), - cl::init(1000)); - static cl::opt<bool> WidenBranchGuards("guard-widening-widen-branch-guards", cl::Hidden, cl::desc("Whether or not we should widen guards " @@ -97,15 +84,16 @@ static Value *getCondition(Instruction *I) { "Bad guard intrinsic?"); return GI->getArgOperand(0); } - if (isGuardAsWidenableBranch(I)) { - auto *Cond = cast<BranchInst>(I)->getCondition(); - return cast<BinaryOperator>(Cond)->getOperand(0); - } + Value *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(I, Cond, WC, IfTrueBB, IfFalseBB)) + return Cond; + return cast<BranchInst>(I)->getCondition(); } // Set the condition for \p I to \p NewCond. \p I can either be a guard or a -// conditional branch. +// conditional branch. static void setCondition(Instruction *I, Value *NewCond) { if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) { assert(GI->getIntrinsicID() == Intrinsic::experimental_guard && @@ -126,7 +114,6 @@ class GuardWideningImpl { DominatorTree &DT; PostDominatorTree *PDT; LoopInfo &LI; - BranchProbabilityInfo *BPI; /// Together, these describe the region of interest. This might be all of /// the blocks within a function, or only a given loop's blocks and preheader. @@ -271,26 +258,22 @@ class GuardWideningImpl { void widenGuard(Instruction *ToWiden, Value *NewCondition, bool InvertCondition) { Value *Result; + widenCondCommon(getCondition(ToWiden), NewCondition, ToWiden, Result, InvertCondition); - Value *WidenableCondition = nullptr; if (isGuardAsWidenableBranch(ToWiden)) { - auto *Cond = cast<BranchInst>(ToWiden)->getCondition(); - WidenableCondition = cast<BinaryOperator>(Cond)->getOperand(1); + setWidenableBranchCond(cast<BranchInst>(ToWiden), Result); + return; } - if (WidenableCondition) - Result = BinaryOperator::CreateAnd(Result, WidenableCondition, - "guard.chk", ToWiden); setCondition(ToWiden, Result); } public: explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT, - LoopInfo &LI, BranchProbabilityInfo *BPI, - DomTreeNode *Root, + LoopInfo &LI, DomTreeNode *Root, std::function<bool(BasicBlock*)> BlockFilter) - : DT(DT), PDT(PDT), LI(LI), BPI(BPI), Root(Root), BlockFilter(BlockFilter) + : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter) {} /// The entry point for this pass. @@ -309,13 +292,6 @@ static bool isSupportedGuardInstruction(const Instruction *Insn) { bool GuardWideningImpl::run() { DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> GuardsInBlock; bool Changed = false; - Optional<BranchProbability> LikelyTaken = None; - if (WidenFrequentBranches && BPI) { - unsigned Threshold = FrequentBranchThreshold; - assert(Threshold > 0 && "Zero threshold makes no sense!"); - LikelyTaken = BranchProbability(Threshold - 1, Threshold); - } - for (auto DFI = df_begin(Root), DFE = df_end(Root); DFI != DFE; ++DFI) { auto *BB = (*DFI)->getBlock(); @@ -330,17 +306,6 @@ bool GuardWideningImpl::run() { for (auto *II : CurrentList) Changed |= eliminateInstrViaWidening(II, DFI, GuardsInBlock); - if (WidenFrequentBranches && BPI) - if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator())) - if (BI->isConditional()) { - // If one of branches of a conditional is likely taken, try to - // eliminate it. 
- if (BPI->getEdgeProbability(BB, 0U) >= *LikelyTaken) - Changed |= eliminateInstrViaWidening(BI, DFI, GuardsInBlock); - else if (BPI->getEdgeProbability(BB, 1U) >= *LikelyTaken) - Changed |= eliminateInstrViaWidening(BI, DFI, GuardsInBlock, - /*InvertCondition*/true); - } } assert(EliminatedGuardsAndBranches.empty() || Changed); @@ -805,10 +770,7 @@ PreservedAnalyses GuardWideningPass::run(Function &F, auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &LI = AM.getResult<LoopAnalysis>(F); auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = AM.getCachedResult<BranchProbabilityAnalysis>(F); - if (!GuardWideningImpl(DT, &PDT, LI, BPI, DT.getRootNode(), + if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(), [](BasicBlock*) { return true; } ).run()) return PreservedAnalyses::all(); @@ -820,22 +782,13 @@ PreservedAnalyses GuardWideningPass::run(Function &F, PreservedAnalyses GuardWideningPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U) { - - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); - Function &F = *L.getHeader()->getParent(); - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(F); - BasicBlock *RootBB = L.getLoopPredecessor(); if (!RootBB) RootBB = L.getHeader(); auto BlockFilter = [&](BasicBlock *BB) { return BB == RootBB || L.contains(BB); }; - if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, BPI, - AR.DT.getNode(RootBB), + if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, AR.DT.getNode(RootBB), BlockFilter).run()) return PreservedAnalyses::all(); @@ -856,10 +809,7 @@ struct GuardWideningLegacyPass : public FunctionPass { auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); - return GuardWideningImpl(DT, &PDT, LI, BPI, DT.getRootNode(), + return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(), [](BasicBlock*) { return true; } ).run(); } @@ -868,8 +818,6 @@ struct GuardWideningLegacyPass : public FunctionPass { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<PostDominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); - if (WidenFrequentBranches) - AU.addRequired<BranchProbabilityInfoWrapperPass>(); } }; @@ -895,16 +843,11 @@ struct LoopGuardWideningLegacyPass : public LoopPass { auto BlockFilter = [&](BasicBlock *BB) { return BB == RootBB || L->contains(BB); }; - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); - return GuardWideningImpl(DT, PDT, LI, BPI, + return GuardWideningImpl(DT, PDT, LI, DT.getNode(RootBB), BlockFilter).run(); } void getAnalysisUsage(AnalysisUsage &AU) const override { - if (WidenFrequentBranches) - AU.addRequired<BranchProbabilityInfoWrapperPass>(); AU.setPreservesCFG(); getLoopAnalysisUsage(AU); AU.addPreserved<PostDominatorTreeWrapperPass>(); @@ -920,8 +863,6 @@ INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards", INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -if (WidenFrequentBranches) - 
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards", false, false) @@ -931,8 +872,6 @@ INITIALIZE_PASS_BEGIN(LoopGuardWideningLegacyPass, "loop-guard-widening", INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -if (WidenFrequentBranches) - INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) INITIALIZE_PASS_END(LoopGuardWideningLegacyPass, "loop-guard-widening", "Widen guards (within a single loop, as a loop pass)", false, false) diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 5519a00c12c97..d8d7acae5c9fd 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -31,8 +31,8 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" @@ -44,7 +44,6 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" @@ -68,6 +67,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -79,6 +79,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include <cassert> @@ -125,10 +126,9 @@ DisableLFTR("disable-lftr", cl::Hidden, cl::init(false), cl::desc("Disable Linear Function Test Replace optimization")); static cl::opt<bool> -LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(false), +LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true), cl::desc("Predicate conditions in read only loops")); - namespace { struct RewritePhi; @@ -2663,11 +2663,11 @@ static const SCEV* getMaxBackedgeTakenCount(ScalarEvolution &SE, // merge the max and exact information to approximate a version of // getConstantMaxBackedgeTakenCount which isn't restricted to just constants. SmallVector<const SCEV*, 4> ExitCounts; - const SCEV *MaxConstEC = SE.getConstantMaxBackedgeTakenCount(L); - if (!isa<SCEVCouldNotCompute>(MaxConstEC)) - ExitCounts.push_back(MaxConstEC); for (BasicBlock *ExitingBB : ExitingBlocks) { const SCEV *ExitCount = SE.getExitCount(L, ExitingBB); + if (isa<SCEVCouldNotCompute>(ExitCount)) + ExitCount = SE.getExitCount(L, ExitingBB, + ScalarEvolution::ConstantMaximum); if (!isa<SCEVCouldNotCompute>(ExitCount)) { assert(DT.dominates(ExitingBB, L->getLoopLatch()) && "We should only have known counts for exiting blocks that " @@ -2835,6 +2835,10 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { !isSafeToExpand(ExactBTC, *SE)) return Changed; + // If we end up with a pointer exit count, bail. It may be unsized. 
+ if (!ExactBTC->getType()->isIntegerTy()) + return Changed; + auto BadExit = [&](BasicBlock *ExitingBB) { // If our exiting block exits multiple loops, we can only rewrite the // innermost one. Otherwise, we're changing how many times the innermost @@ -2865,6 +2869,10 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { !isSafeToExpand(ExitCount, *SE)) return true; + // If we end up with a pointer exit count, bail. It may be unsized. + if (!ExitCount->getType()->isIntegerTy()) + return true; + return false; }; @@ -2936,7 +2944,7 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { // varying check. Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator()); IRBuilder<> B(L->getLoopPreheader()->getTerminator()); - Value *ExactBTCV = nullptr; //lazy generated if needed + Value *ExactBTCV = nullptr; // Lazily generated if needed. for (BasicBlock *ExitingBB : ExitingBlocks) { const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); @@ -2991,15 +2999,15 @@ bool IndVarSimplify::run(Loop *L) { if (!L->isLoopSimplifyForm()) return false; - // If there are any floating-point recurrences, attempt to - // transform them to use integer recurrences. - Changed |= rewriteNonIntegerIVs(L); - #ifndef NDEBUG // Used below for a consistency check only const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); #endif + // If there are any floating-point recurrences, attempt to + // transform them to use integer recurrences. + Changed |= rewriteNonIntegerIVs(L); + // Create a rewriter object which we'll use to transform the code with. SCEVExpander Rewriter(*SE, DL, "indvars"); #ifndef NDEBUG @@ -3026,11 +3034,19 @@ bool IndVarSimplify::run(Loop *L) { NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); // Try to eliminate loop exits based on analyzeable exit counts - Changed |= optimizeLoopExits(L, Rewriter); + if (optimizeLoopExits(L, Rewriter)) { + Changed = true; + // Given we've changed exit counts, notify SCEV + SE->forgetLoop(L); + } // Try to form loop invariant tests for loop exits by changing how many // iterations of the loop run when that is unobservable. - Changed |= predicateLoopExits(L, Rewriter); + if (predicateLoopExits(L, Rewriter)) { + Changed = true; + // Given we've changed exit counts, notify SCEV + SE->forgetLoop(L); + } // If we have a trip count expression, rewrite the loop's exit condition // using it. @@ -3118,7 +3134,8 @@ bool IndVarSimplify::run(Loop *L) { "Indvars did not preserve LCSSA!"); // Verify that LFTR, and any other change have not interfered with SCEV's - // ability to compute trip count. + // ability to compute trip count. We may have *changed* the exit count, but + // only by reducing it. 
#ifndef NDEBUG if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { SE->forgetLoop(L); @@ -3130,7 +3147,8 @@ bool IndVarSimplify::run(Loop *L) { else BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, NewBECount->getType()); - assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV"); + assert(!SE->isKnownPredicate(ICmpInst::ICMP_ULT, BackedgeTakenCount, + NewBECount) && "indvars must preserve SCEV"); } #endif diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 997d68838152f..58469749600e5 100644 --- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -74,6 +74,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index e7e73a132fbec..dfb1b6bfb7398 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -141,11 +141,11 @@ using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>; /// InferAddressSpaces class InferAddressSpaces : public FunctionPass { - const TargetTransformInfo *TTI; + const TargetTransformInfo *TTI = nullptr; /// Target specific address space which uses of should be replaced if /// possible. - unsigned FlatAddrSpace; + unsigned FlatAddrSpace = 0; public: static char ID; @@ -791,8 +791,8 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias); if (auto *MSI = dyn_cast<MemSetInst>(MI)) { - B.CreateMemSet(NewV, MSI->getValue(), - MSI->getLength(), MSI->getDestAlignment(), + B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(), + MaybeAlign(MSI->getDestAlignment()), false, // isVolatile TBAA, ScopeMD, NoAliasMD); } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) { @@ -808,15 +808,13 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, if (isa<MemCpyInst>(MTI)) { MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct); - B.CreateMemCpy(Dest, MTI->getDestAlignment(), - Src, MTI->getSourceAlignment(), + B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(), MTI->getLength(), false, // isVolatile TBAA, TBAAStruct, ScopeMD, NoAliasMD); } else { assert(isa<MemMoveInst>(MTI)); - B.CreateMemMove(Dest, MTI->getDestAlignment(), - Src, MTI->getSourceAlignment(), + B.CreateMemMove(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(), MTI->getLength(), false, // isVolatile TBAA, ScopeMD, NoAliasMD); diff --git a/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp b/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp index ec28f790f2522..e8bbf2936da65 100644 --- a/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp +++ b/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Local.h" diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 0cf00baaa24a0..98c2fcb3dae0f 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ 
b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -55,6 +55,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" @@ -305,14 +306,13 @@ bool JumpThreading::runOnFunction(Function &F) { DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy); std::unique_ptr<BlockFrequencyInfo> BFI; std::unique_ptr<BranchProbabilityInfo> BPI; - bool HasProfileData = F.hasProfileData(); - if (HasProfileData) { + if (F.hasProfileData()) { LoopInfo LI{DominatorTree(F)}; BPI.reset(new BranchProbabilityInfo(F, LI, TLI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } - bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, HasProfileData, + bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(), std::move(BFI), std::move(BPI)); if (PrintLVIAfterJumpThreading) { dbgs() << "LVI for function '" << F.getName() << "':\n"; @@ -339,7 +339,7 @@ PreservedAnalyses JumpThreadingPass::run(Function &F, BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } - bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, HasProfileData, + bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(), std::move(BFI), std::move(BPI)); if (!Changed) @@ -1002,49 +1002,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // successor, merge the blocks. This encourages recursive jump threading // because now the condition in this block can be threaded through // predecessors of our predecessor block. - if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { - const Instruction *TI = SinglePred->getTerminator(); - if (!TI->isExceptionalTerminator() && TI->getNumSuccessors() == 1 && - SinglePred != BB && !hasAddressTakenAndUsed(BB)) { - // If SinglePred was a loop header, BB becomes one. - if (LoopHeaders.erase(SinglePred)) - LoopHeaders.insert(BB); - - LVI->eraseBlock(SinglePred); - MergeBasicBlockIntoOnlyPred(BB, DTU); - - // Now that BB is merged into SinglePred (i.e. SinglePred Code followed by - // BB code within one basic block `BB`), we need to invalidate the LVI - // information associated with BB, because the LVI information need not be - // true for all of BB after the merge. For example, - // Before the merge, LVI info and code is as follows: - // SinglePred: <LVI info1 for %p val> - // %y = use of %p - // call @exit() // need not transfer execution to successor. - // assume(%p) // from this point on %p is true - // br label %BB - // BB: <LVI info2 for %p val, i.e. %p is true> - // %x = use of %p - // br label exit - // - // Note that this LVI info for blocks BB and SinglPred is correct for %p - // (info2 and info1 respectively). After the merge and the deletion of the - // LVI info1 for SinglePred. We have the following code: - // BB: <LVI info2 for %p val> - // %y = use of %p - // call @exit() - // assume(%p) - // %x = use of %p <-- LVI info2 is correct from here onwards. - // br label exit - // LVI info2 for BB is incorrect at the beginning of BB. - - // Invalidate LVI information for BB if the LVI is not provably true for - // all of BB. - if (!isGuaranteedToTransferExecutionToSuccessor(BB)) - LVI->eraseBlock(BB); - return true; - } - } + if (MaybeMergeBasicBlockIntoOnlyPred(BB)) + return true; if (TryToUnfoldSelectInCurrBB(BB)) return true; @@ -1758,7 +1717,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, getSuccessor(GetBestDestForJumpOnUndef(BB)); // Ok, try to thread it! 
- return ThreadEdge(BB, PredsToFactor, MostPopularDest); + return TryThreadEdge(BB, PredsToFactor, MostPopularDest); } /// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on @@ -1920,12 +1879,146 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, } } -/// ThreadEdge - We have decided that it is safe and profitable to factor the -/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB -/// across BB. Transform the IR to reflect this change. -bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, - const SmallVectorImpl<BasicBlock *> &PredBBs, - BasicBlock *SuccBB) { +/// Merge basic block BB into its sole predecessor if possible. +bool JumpThreadingPass::MaybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) { + BasicBlock *SinglePred = BB->getSinglePredecessor(); + if (!SinglePred) + return false; + + const Instruction *TI = SinglePred->getTerminator(); + if (TI->isExceptionalTerminator() || TI->getNumSuccessors() != 1 || + SinglePred == BB || hasAddressTakenAndUsed(BB)) + return false; + + // If SinglePred was a loop header, BB becomes one. + if (LoopHeaders.erase(SinglePred)) + LoopHeaders.insert(BB); + + LVI->eraseBlock(SinglePred); + MergeBasicBlockIntoOnlyPred(BB, DTU); + + // Now that BB is merged into SinglePred (i.e. SinglePred code followed by + // BB code within one basic block `BB`), we need to invalidate the LVI + // information associated with BB, because the LVI information need not be + // true for all of BB after the merge. For example, + // Before the merge, LVI info and code is as follows: + // SinglePred: <LVI info1 for %p val> + // %y = use of %p + // call @exit() // need not transfer execution to successor. + // assume(%p) // from this point on %p is true + // br label %BB + // BB: <LVI info2 for %p val, i.e. %p is true> + // %x = use of %p + // br label exit + // + // Note that this LVI info for blocks BB and SinglPred is correct for %p + // (info2 and info1 respectively). After the merge and the deletion of the + // LVI info1 for SinglePred. We have the following code: + // BB: <LVI info2 for %p val> + // %y = use of %p + // call @exit() + // assume(%p) + // %x = use of %p <-- LVI info2 is correct from here onwards. + // br label exit + // LVI info2 for BB is incorrect at the beginning of BB. + + // Invalidate LVI information for BB if the LVI is not provably true for + // all of BB. + if (!isGuaranteedToTransferExecutionToSuccessor(BB)) + LVI->eraseBlock(BB); + return true; +} + +/// Update the SSA form. NewBB contains instructions that are copied from BB. +/// ValueMapping maps old values in BB to new ones in NewBB. +void JumpThreadingPass::UpdateSSA( + BasicBlock *BB, BasicBlock *NewBB, + DenseMap<Instruction *, Value *> &ValueMapping) { + // If there were values defined in BB that are used outside the block, then we + // now have to update all uses of the value to use either the original value, + // the cloned value, or some PHI derived value. This can require arbitrary + // PHI insertion, of which we are prepared to do, clean these up now. + SSAUpdater SSAUpdate; + SmallVector<Use *, 16> UsesToRename; + + for (Instruction &I : *BB) { + // Scan all uses of this instruction to see if it is used outside of its + // block, and if so, record them in UsesToRename. 
+ for (Use &U : I.uses()) { + Instruction *User = cast<Instruction>(U.getUser()); + if (PHINode *UserPN = dyn_cast<PHINode>(User)) { + if (UserPN->getIncomingBlock(U) == BB) + continue; + } else if (User->getParent() == BB) + continue; + + UsesToRename.push_back(&U); + } + + // If there are no uses outside the block, we're done with this instruction. + if (UsesToRename.empty()) + continue; + LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); + + // We found a use of I outside of BB. Rename all uses of I that are outside + // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks + // with the two values we know. + SSAUpdate.Initialize(I.getType(), I.getName()); + SSAUpdate.AddAvailableValue(BB, &I); + SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); + + while (!UsesToRename.empty()) + SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); + LLVM_DEBUG(dbgs() << "\n"); + } +} + +/// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone +/// arguments that come from PredBB. Return the map from the variables in the +/// source basic block to the variables in the newly created basic block. +DenseMap<Instruction *, Value *> +JumpThreadingPass::CloneInstructions(BasicBlock::iterator BI, + BasicBlock::iterator BE, BasicBlock *NewBB, + BasicBlock *PredBB) { + // We are going to have to map operands from the source basic block to the new + // copy of the block 'NewBB'. If there are PHI nodes in the source basic + // block, evaluate them to account for entry from PredBB. + DenseMap<Instruction *, Value *> ValueMapping; + + // Clone the phi nodes of the source basic block into NewBB. The resulting + // phi nodes are trivial since NewBB only has one predecessor, but SSAUpdater + // might need to rewrite the operand of the cloned phi. + for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) { + PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB); + NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB); + ValueMapping[PN] = NewPN; + } + + // Clone the non-phi instructions of the source basic block into NewBB, + // keeping track of the mapping and using it to remap operands in the cloned + // instructions. + for (; BI != BE; ++BI) { + Instruction *New = BI->clone(); + New->setName(BI->getName()); + NewBB->getInstList().push_back(New); + ValueMapping[&*BI] = New; + + // Remap operands to patch up intra-block references. + for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) + if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { + DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst); + if (I != ValueMapping.end()) + New->setOperand(i, I->second); + } + } + + return ValueMapping; +} + +/// TryThreadEdge - Thread an edge if it's safe and profitable to do so. +bool JumpThreadingPass::TryThreadEdge( + BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs, + BasicBlock *SuccBB) { // If threading to the same block as we come from, we would infinite loop. if (SuccBB == BB) { LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName() @@ -1955,6 +2048,21 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, return false; } + ThreadEdge(BB, PredBBs, SuccBB); + return true; +} + +/// ThreadEdge - We have decided that it is safe and profitable to factor the +/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB +/// across BB. Transform the IR to reflect this change. 
+void JumpThreadingPass::ThreadEdge(BasicBlock *BB, + const SmallVectorImpl<BasicBlock *> &PredBBs, + BasicBlock *SuccBB) { + assert(SuccBB != BB && "Don't create an infinite loop"); + + assert(!LoopHeaders.count(BB) && !LoopHeaders.count(SuccBB) && + "Don't thread across loop headers"); + // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) @@ -1968,7 +2076,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // And finally, do it! LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '" << SuccBB->getName() - << "' with cost: " << JumpThreadCost << ", across block:\n " << *BB << "\n"); if (DTU->hasPendingDomTreeUpdates()) @@ -1977,11 +2084,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, LVI->enableDT(); LVI->threadEdge(PredBB, BB, SuccBB); - // We are going to have to map operands from the original BB block to the new - // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to - // account for entry from PredBB. - DenseMap<Instruction*, Value*> ValueMapping; - BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+".thread", BB->getParent(), BB); @@ -1994,32 +2096,9 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); } - BasicBlock::iterator BI = BB->begin(); - // Clone the phi nodes of BB into NewBB. The resulting phi nodes are trivial, - // since NewBB only has one predecessor, but SSAUpdater might need to rewrite - // the operand of the cloned phi. - for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) { - PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB); - NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB); - ValueMapping[PN] = NewPN; - } - - // Clone the non-phi instructions of BB into NewBB, keeping track of the - // mapping and using it to remap operands in the cloned instructions. - for (; !BI->isTerminator(); ++BI) { - Instruction *New = BI->clone(); - New->setName(BI->getName()); - NewBB->getInstList().push_back(New); - ValueMapping[&*BI] = New; - - // Remap operands to patch up intra-block references. - for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) - if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { - DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst); - if (I != ValueMapping.end()) - New->setOperand(i, I->second); - } - } + // Copy all the instructions from BB to NewBB except the terminator. + DenseMap<Instruction *, Value *> ValueMapping = + CloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB); // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. @@ -2045,44 +2124,7 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, {DominatorTree::Insert, PredBB, NewBB}, {DominatorTree::Delete, PredBB, BB}}); - // If there were values defined in BB that are used outside the block, then we - // now have to update all uses of the value to use either the original value, - // the cloned value, or some PHI derived value. This can require arbitrary - // PHI insertion, of which we are prepared to do, clean these up now. - SSAUpdater SSAUpdate; - SmallVector<Use*, 16> UsesToRename; - - for (Instruction &I : *BB) { - // Scan all uses of this instruction to see if their uses are no longer - // dominated by the previous def and if so, record them in UsesToRename. 
- // Also, skip phi operands from PredBB - we'll remove them anyway. - for (Use &U : I.uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(U) == BB) - continue; - } else if (User->getParent() == BB) - continue; - - UsesToRename.push_back(&U); - } - - // If there are no uses outside the block, we're done with this instruction. - if (UsesToRename.empty()) - continue; - LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); - - // We found a use of I outside of BB. Rename all uses of I that are outside - // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks - // with the two values we know. - SSAUpdate.Initialize(I.getType(), I.getName()); - SSAUpdate.AddAvailableValue(BB, &I); - SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); - - while (!UsesToRename.empty()) - SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - LLVM_DEBUG(dbgs() << "\n"); - } + UpdateSSA(BB, NewBB, ValueMapping); // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This @@ -2094,7 +2136,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // Threaded an edge! ++NumThreads; - return true; } /// Create a new basic block that will be the predecessor of BB and successor of @@ -2366,43 +2407,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB, ValueMapping); - // If there were values defined in BB that are used outside the block, then we - // now have to update all uses of the value to use either the original value, - // the cloned value, or some PHI derived value. This can require arbitrary - // PHI insertion, of which we are prepared to do, clean these up now. - SSAUpdater SSAUpdate; - SmallVector<Use*, 16> UsesToRename; - for (Instruction &I : *BB) { - // Scan all uses of this instruction to see if it is used outside of its - // block, and if so, record them in UsesToRename. - for (Use &U : I.uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(U) == BB) - continue; - } else if (User->getParent() == BB) - continue; - - UsesToRename.push_back(&U); - } - - // If there are no uses outside the block, we're done with this instruction. - if (UsesToRename.empty()) - continue; - - LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); - - // We found a use of I outside of BB. Rename all uses of I that are outside - // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks - // with the two values we know. - SSAUpdate.Initialize(I.getType(), I.getName()); - SSAUpdate.AddAvailableValue(BB, &I); - SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&I]); - - while (!UsesToRename.empty()) - SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - LLVM_DEBUG(dbgs() << "\n"); - } + UpdateSSA(BB, PredBB, ValueMapping); // PredBB no longer jumps to BB, remove entries in the PHI node for the edge // that we nuked. 
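Editorial note on the JumpThreading.cpp hunks above: this is a mechanical refactoring. The old ThreadEdge is split into TryThreadEdge (legality/profitability checks) plus an unconditional ThreadEdge that asserts its preconditions; the duplicated use-renaming and instruction-cloning loops are hoisted into UpdateSSA and CloneInstructions so DuplicateCondBranchOnPHIIntoPred can share them; and the single-predecessor merge in ProcessBlock is likewise extracted into MaybeMergeBasicBlockIntoOnlyPred. Below is a minimal, self-contained sketch of the "TryX checks, X transforms" split on a toy graph type; Graph and the function names are invented for illustration and are not LLVM APIs.

// Illustrative sketch only, not part of the commit above.
#include <cassert>
#include <cstdio>
#include <set>
#include <utility>

struct Graph {
  std::set<std::pair<int, int>> Edges; // directed edges between block ids
  bool hasEdge(int From, int To) const { return Edges.count({From, To}) != 0; }
};

// Unconditional worker: callers must have established the preconditions,
// which are re-checked with asserts (mirroring how ThreadEdge now asserts
// what TryThreadEdge already verified).
static void threadEdge(Graph &G, int Pred, int BB, int Succ) {
  assert(Succ != BB && "Don't create an infinite loop");
  assert(G.hasEdge(Pred, BB) && G.hasEdge(BB, Succ) && "Edges must exist");
  G.Edges.erase({Pred, BB});
  G.Edges.insert({Pred, Succ}); // Pred now jumps straight to Succ
}

// Guarded wrapper: performs the checks, then calls the worker and reports
// whether anything changed.
static bool tryThreadEdge(Graph &G, int Pred, int BB, int Succ) {
  if (Succ == BB) // threading to the block we come from would self-loop
    return false;
  if (!G.hasEdge(Pred, BB) || !G.hasEdge(BB, Succ))
    return false;
  threadEdge(G, Pred, BB, Succ);
  return true;
}

int main() {
  Graph G{{{0, 1}, {1, 2}}};
  std::printf("threaded: %s\n", tryThreadEdge(G, 0, 1, 2) ? "yes" : "no");
  std::printf("0->2 edge present: %s\n", G.hasEdge(0, 2) ? "yes" : "no");
}

Keeping the checks in the wrapper and only asserts in the worker lets other call sites reuse the transform once they have proven the same preconditions by other means.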
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 6ce4831a73592..8c33045c23802 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -63,6 +63,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/PredIteratorCache.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -137,7 +138,8 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, TargetTransformInfo *TTI, bool &FreeInLoop); static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); + MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); @@ -162,7 +164,7 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU); + MemorySSAUpdater *MSSAU, ScalarEvolution *SE); namespace { struct LoopInvariantCodeMotion { @@ -390,8 +392,9 @@ bool LoopInvariantCodeMotion::runOnLoop( CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); Flags.IsSink = false; if (Preheader) - Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, - CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); + Changed |= + hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, + CurAST.get(), MSSAU.get(), SE, &SafetyInfo, Flags, ORE); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -787,6 +790,41 @@ public: }; } // namespace + +/// Return true if we know how to rewrite all uses of the given alloca after +/// hoisting it out of the loop. The main concerns are a) potential captures +/// and b) invariant.start markers which don't capture, but are no longer +/// valid w/o a corresponding invariant.end. +static bool canRewriteUsesOfAlloca(AllocaInst &AI) { + // TODO: This looks a lot like capture tracking, but we need to remove any + // invariant starts if we extend the lifetime of the alloca by hoisting it. + // We should probably refactor capture tracking into a form which allows us + // to reuse the relevant bits and remove the duplicated logic here. + + SmallVector<Use *, 16> Worklist; + for (Use &U : AI.uses()) + Worklist.push_back(&U); + + unsigned NumUsesExplored = 0; + while (!Worklist.empty()) { + Use *U = Worklist.pop_back_val(); + Instruction *I = cast<Instruction>(U->getUser()); + NumUsesExplored++; + if (NumUsesExplored > DefaultMaxUsesToExplore) + return false; + // Non capturing, terminating uses + if (isa<LoadInst>(I) || + (isa<StoreInst>(I) && U->getOperandNo() == 1)) + continue; + // Non capturing, non-terminating + if (!isa<BitCastInst>(I) && !isa<GetElementPtrInst>(I)) + return false; + for (Use &U : I->uses()) + Worklist.push_back(&U); + } + return true; +} + /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. 
This allows us to visit definitions before @@ -795,7 +833,7 @@ public: bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, - ICFLoopSafetyInfo *SafetyInfo, + ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { // Verify inputs. @@ -855,7 +893,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, I, DT, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, ORE); + MSSAU, SE, ORE); HoistedInstructions.push_back(&I); Changed = true; continue; @@ -882,7 +920,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, eraseInstruction(I, *SafetyInfo, CurAST, MSSAU); hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), - SafetyInfo, MSSAU, ORE); + SafetyInfo, MSSAU, SE, ORE); HoistedInstructions.push_back(ReciprocalDivisor); Changed = true; continue; @@ -901,7 +939,17 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, CurLoop->hasLoopInvariantOperands(&I) && MustExecuteWithoutWritesBefore(I)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, ORE); + MSSAU, SE, ORE); + HoistedInstructions.push_back(&I); + Changed = true; + continue; + } + + if (isa<AllocaInst>(&I) && + SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) && + canRewriteUsesOfAlloca(cast<AllocaInst>(I))) { + hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, + MSSAU, SE, ORE); HoistedInstructions.push_back(&I); Changed = true; continue; @@ -915,7 +963,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, PN->setIncomingBlock( i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i))); hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, ORE); + MSSAU, SE, ORE); assert(DT->dominates(PN, BB) && "Conditional PHIs not expected"); Changed = true; continue; @@ -952,7 +1000,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, LLVM_DEBUG(dbgs() << "LICM rehoisting to " << HoistPoint->getParent()->getName() << ": " << *I << "\n"); - moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU); + moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU, SE); HoistPoint = I; Changed = true; } @@ -1142,6 +1190,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // Assumes don't actually alias anything or throw return true; + if (match(CI, m_Intrinsic<Intrinsic::experimental_widenable_condition>())) + // Widenable conditions don't actually alias anything or throw + return true; + // Handle simple cases by querying alias analysis. 
FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI); if (Behavior == FMRB_DoesNotAccessMemory) @@ -1441,14 +1493,18 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, + ScalarEvolution *SE) { SafetyInfo.removeInstruction(&I); SafetyInfo.insertInstructionTo(&I, Dest.getParent()); I.moveBefore(&Dest); if (MSSAU) if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>( MSSAU->getMemorySSA()->getMemoryAccess(&I))) - MSSAU->moveToPlace(OldMemAcc, Dest.getParent(), MemorySSA::End); + MSSAU->moveToPlace(OldMemAcc, Dest.getParent(), + MemorySSA::BeforeTerminator); + if (SE) + SE->forgetValue(&I); } static Instruction *sinkThroughTriviallyReplaceablePHI( @@ -1662,7 +1718,8 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, /// static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE) { + MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE) { LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getName() << ": " << I << "\n"); ORE->emit([&]() { @@ -1683,10 +1740,10 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, if (isa<PHINode>(I)) // Move the new node to the end of the phi list in the destination block. - moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo, MSSAU); + moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo, MSSAU, SE); else // Move the new node to the destination block, before its terminator. - moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU); + moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU, SE); // Apply line 0 debug locations when we are moving instructions to different // basic blocks because we want to avoid jumpy line tables. 
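Editorial note on the LICM.cpp hunks above: besides plumbing ScalarEvolution into hoist()/moveInstructionBefore() so cached SCEV facts are dropped (SE->forgetValue) whenever an instruction is moved, the change adds canRewriteUsesOfAlloca, a bounded worklist walk over an alloca's uses that gives up on anything that might capture the pointer. The following is a minimal sketch of that bounded use-walk on a toy use graph; Node, UseKind and the 16-use limit are invented for the example, whereas the real code walks llvm::Use edges under DefaultMaxUsesToExplore.

// Illustrative sketch only, not part of the commit above.
#include <cstdio>
#include <vector>

enum class UseKind { Load, StoreAsAddress, Gep, Bitcast, Other };

struct Node {
  UseKind Kind;              // what kind of user this node models
  std::vector<Node *> Users; // who uses this node's result
};

static bool canRewriteAllUses(Node &Root, unsigned MaxUses = 16) {
  std::vector<Node *> Worklist(Root.Users.begin(), Root.Users.end());
  unsigned Explored = 0;
  while (!Worklist.empty()) {
    Node *U = Worklist.back();
    Worklist.pop_back();
    if (++Explored > MaxUses) // give up rather than scan forever
      return false;
    switch (U->Kind) {
    case UseKind::Load:
    case UseKind::StoreAsAddress: // terminating, non-capturing uses
      continue;
    case UseKind::Gep:
    case UseKind::Bitcast:        // pointer pass-throughs: keep walking
      for (Node *N : U->Users)
        Worklist.push_back(N);
      continue;
    case UseKind::Other:          // anything else might capture the pointer
      return false;
    }
  }
  return true;
}

int main() {
  Node Load{UseKind::Load, {}};
  Node Gep{UseKind::Gep, {&Load}};
  Node Alloca{UseKind::Other, {&Gep}};
  std::printf("rewritable: %s\n", canRewriteAllUses(Alloca) ? "yes" : "no");
}

Bailing out once the exploration limit is hit, or on any user the walk does not recognize, keeps the check cheap and conservatively correct.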
diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index a972d6fa2fcdf..ab65f56d088f7 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" +#include "llvm/InitializePasses.h" #define DEBUG_TYPE "loop-data-prefetch" #include "llvm/ADT/DepthFirstIterator.h" diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index cee197cf83540..2451572d61715 100644 --- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopUtils.h" diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index f45e5fd0f50b4..8e04e6e0ffe83 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -55,6 +55,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 9f93c68e6128d..e1738f08eb238 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -55,12 +55,15 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/Function.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/CodeMoverUtils.h" using namespace llvm; @@ -88,6 +91,7 @@ STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); STATISTIC(NonIdenticalGuards, "Candidates have different guards"); STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block"); STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block"); +STATISTIC(NotRotated, "Candidate is not rotated"); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -163,14 +167,8 @@ struct FusionCandidate { const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE) : Preheader(L->getLoopPreheader()), Header(L->getHeader()), ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), - Latch(L->getLoopLatch()), L(L), Valid(true), GuardBranch(nullptr), - DT(DT), PDT(PDT), ORE(ORE) { - - // TODO: This is temporary while we fuse both rotated and non-rotated - // loops. Once we switch to only fusing rotated loops, the initialization of - // GuardBranch can be moved into the initialization list above. - if (isRotated()) - GuardBranch = L->getLoopGuardBranch(); + Latch(L->getLoopLatch()), L(L), Valid(true), + GuardBranch(L->getLoopGuardBranch()), DT(DT), PDT(PDT), ORE(ORE) { // Walk over all blocks in the loop and check for conditions that may // prevent fusion. 
For each block, walk over all instructions and collect @@ -257,15 +255,14 @@ struct FusionCandidate { : GuardBranch->getSuccessor(0); } - bool isRotated() const { - assert(L && "Expecting loop to be valid."); - assert(Latch && "Expecting latch to be valid."); - return L->isLoopExiting(Latch); - } - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void dump() const { - dbgs() << "\tGuardBranch: " + dbgs() << "\tGuardBranch: "; + if (GuardBranch) + dbgs() << *GuardBranch; + else + dbgs() << "nullptr"; + dbgs() << "\n" << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") << "\n" @@ -316,6 +313,11 @@ struct FusionCandidate { return reportInvalidCandidate(NotSimplifiedForm); } + if (!L->isRotatedForm()) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n"); + return reportInvalidCandidate(NotRotated); + } + return true; } @@ -591,16 +593,8 @@ private: const FusionCandidate &FC1) const { assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); - BasicBlock *FC0EntryBlock = FC0.getEntryBlock(); - BasicBlock *FC1EntryBlock = FC1.getEntryBlock(); - - if (DT.dominates(FC0EntryBlock, FC1EntryBlock)) - return PDT.dominates(FC1EntryBlock, FC0EntryBlock); - - if (DT.dominates(FC1EntryBlock, FC0EntryBlock)) - return PDT.dominates(FC0EntryBlock, FC1EntryBlock); - - return false; + return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(), + DT, PDT); } /// Iterate over all loops in the given loop set and identify the loops that @@ -1113,6 +1107,29 @@ private: return FC.ExitBlock->size() == 1; } + /// Simplify the condition of the latch branch of \p FC to true, when both of + /// its successors are the same. + void simplifyLatchBranch(const FusionCandidate &FC) const { + BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator()); + if (FCLatchBranch) { + assert(FCLatchBranch->isConditional() && + FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && + "Expecting the two successors of FCLatchBranch to be the same"); + FCLatchBranch->setCondition( + llvm::ConstantInt::getTrue(FCLatchBranch->getCondition()->getType())); + } + } + + /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique + /// successor, then merge FC0.Latch with its unique successor. + void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { + moveInstsBottomUp(*FC0.Latch, *FC1.Latch, DT, PDT, DI); + if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { + MergeBlockIntoPredecessor(Succ, &DTU, &LI); + DTU.flush(); + } + } + /// Fuse two fusion candidates, creating a new fused loop. /// /// This method contains the mechanics of fusing two loops, represented by \p @@ -1246,6 +1263,10 @@ private: FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); + // Change the condition of FC0 latch branch to true, as both successors of + // the branch are the same. + simplifyLatchBranch(FC0); + // If FC0.Latch and FC0.ExitingBlock are the same then we have already // performed the updates above. if (FC0.Latch != FC0.ExitingBlock) @@ -1268,9 +1289,15 @@ private: // Is there a way to keep SE up-to-date so we don't need to forget the loops // and rebuild the information in subsequent passes of fusion? + // Note: Need to forget the loops before merging the loop latches, as + // mergeLatch may remove the only block in FC1. 
SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); + // Move instructions from FC0.Latch to FC1.Latch. + // Note: mergeLatch requires an updated DT. + mergeLatch(FC0, FC1); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), FC1.L->block_end()); @@ -1490,6 +1517,10 @@ private: FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); + // Change the condition of FC0 latch branch to true, as both successors of + // the branch are the same. + simplifyLatchBranch(FC0); + // If FC0.Latch and FC0.ExitingBlock are the same then we have already // performed the updates above. if (FC0.Latch != FC0.ExitingBlock) @@ -1521,9 +1552,15 @@ private: // Is there a way to keep SE up-to-date so we don't need to forget the loops // and rebuild the information in subsequent passes of fusion? + // Note: Need to forget the loops before merging the loop latches, as + // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); + // Move instructions from FC0.Latch to FC1.Latch. + // Note: mergeLatch requires an updated DT. + mergeLatch(FC0, FC1); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), FC1.L->block_end()); diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index dd477e8006937..b77843d7cd711 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -41,7 +41,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -78,20 +77,17 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -107,7 +103,6 @@ using namespace llvm; STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); -STATISTIC(NumBCmp, "Number of memcmp's formed from loop 2xload+eq-compare"); static cl::opt<bool> UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", @@ -117,26 +112,6 @@ static cl::opt<bool> UseLIRCodeSizeHeurs( namespace { -// FIXME: reinventing the wheel much? Is there a cleaner solution? 
-struct PMAbstraction { - virtual void markLoopAsDeleted(Loop *L) = 0; - virtual ~PMAbstraction() = default; -}; -struct LegacyPMAbstraction : PMAbstraction { - LPPassManager &LPM; - LegacyPMAbstraction(LPPassManager &LPM) : LPM(LPM) {} - virtual ~LegacyPMAbstraction() = default; - void markLoopAsDeleted(Loop *L) override { LPM.markLoopAsDeleted(*L); } -}; -struct NewPMAbstraction : PMAbstraction { - LPMUpdater &Updater; - NewPMAbstraction(LPMUpdater &Updater) : Updater(Updater) {} - virtual ~NewPMAbstraction() = default; - void markLoopAsDeleted(Loop *L) override { - Updater.markLoopAsDeleted(*L, L->getName()); - } -}; - class LoopIdiomRecognize { Loop *CurLoop = nullptr; AliasAnalysis *AA; @@ -146,7 +121,6 @@ class LoopIdiomRecognize { TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; const DataLayout *DL; - PMAbstraction &LoopDeleter; OptimizationRemarkEmitter &ORE; bool ApplyCodeSizeHeuristics; @@ -155,10 +129,9 @@ public: LoopInfo *LI, ScalarEvolution *SE, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const DataLayout *DL, PMAbstraction &LoopDeleter, + const DataLayout *DL, OptimizationRemarkEmitter &ORE) - : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), - LoopDeleter(LoopDeleter), ORE(ORE) {} + : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {} bool runOnLoop(Loop *L); @@ -172,8 +145,6 @@ private: bool HasMemset; bool HasMemsetPattern; bool HasMemcpy; - bool HasMemCmp; - bool HasBCmp; /// Return code for isLegalStore() enum LegalStoreKind { @@ -201,7 +172,7 @@ private: bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, Value *StoredVal, + MaybeAlign StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, @@ -216,32 +187,6 @@ private: bool runOnNoncountableLoop(); - struct CmpLoopStructure { - Value *BCmpValue, *LatchCmpValue; - BasicBlock *HeaderBrEqualBB, *HeaderBrUnequalBB; - BasicBlock *LatchBrFinishBB, *LatchBrContinueBB; - }; - bool matchBCmpLoopStructure(CmpLoopStructure &CmpLoop) const; - struct CmpOfLoads { - ICmpInst::Predicate BCmpPred; - Value *LoadSrcA, *LoadSrcB; - Value *LoadA, *LoadB; - }; - bool matchBCmpOfLoads(Value *BCmpValue, CmpOfLoads &CmpOfLoads) const; - bool recognizeBCmpLoopControlFlow(const CmpOfLoads &CmpOfLoads, - CmpLoopStructure &CmpLoop) const; - bool recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, CmpOfLoads &CmpOfLoads, - const SCEV *&SrcA, const SCEV *&SrcB, - const SCEV *&Iterations) const; - bool detectBCmpIdiom(ICmpInst *&BCmpInst, CmpInst *&LatchCmpInst, - LoadInst *&LoadA, LoadInst *&LoadB, const SCEV *&SrcA, - const SCEV *&SrcB, const SCEV *&NBytes) const; - BasicBlock *transformBCmpControlFlow(ICmpInst *ComparedEqual); - void transformLoopToBCmp(ICmpInst *BCmpInst, CmpInst *LatchCmpInst, - LoadInst *LoadA, LoadInst *LoadB, const SCEV *SrcA, - const SCEV *SrcB, const SCEV *NBytes); - bool recognizeBCmp(); - bool recognizePopcount(); void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var); @@ -279,14 +224,13 @@ public: &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( *L->getHeader()->getParent()); const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout(); - LegacyPMAbstraction LoopDeleter(LPM); // For the old PM, we can't use OptimizationRemarkEmitter as an analysis // pass. 
Function analyses need to be preserved across loop transformations // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L->getHeader()->getParent()); - LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, LoopDeleter, ORE); + LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, ORE); return LIR.runOnLoop(L); } @@ -305,7 +249,7 @@ char LoopIdiomRecognizeLegacyPass::ID = 0; PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, - LPMUpdater &Updater) { + LPMUpdater &) { const auto *DL = &L.getHeader()->getModule()->getDataLayout(); const auto &FAM = @@ -319,9 +263,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, "LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached " "at a higher level"); - NewPMAbstraction LoopDeleter(Updater); LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL, - LoopDeleter, *ORE); + *ORE); if (!LIR.runOnLoop(&L)) return PreservedAnalyses::all(); @@ -358,8 +301,7 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { // Disable loop idiom recognition if the function's name is a common idiom. StringRef Name = L->getHeader()->getParent()->getName(); - if (Name == "memset" || Name == "memcpy" || Name == "memcmp" || - Name == "bcmp") + if (Name == "memset" || Name == "memcpy") return false; // Determine if code size heuristics need to be applied. @@ -369,10 +311,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { HasMemset = TLI->has(LibFunc_memset); HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); HasMemcpy = TLI->has(LibFunc_memcpy); - HasMemCmp = TLI->has(LibFunc_memcmp); - HasBCmp = TLI->has(LibFunc_bcmp); - if (HasMemset || HasMemsetPattern || HasMemcpy || HasMemCmp || HasBCmp) + if (HasMemset || HasMemsetPattern || HasMemcpy) if (SE->hasLoopInvariantBackedgeTakenCount(L)) return runOnCountableLoop(); @@ -791,7 +731,8 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, bool NegStride = StoreSize == -Stride; - if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(), + if (processLoopStridedStore(StorePtr, StoreSize, + MaybeAlign(HeadStore->getAlignment()), StoredVal, HeadStore, AdjacentStores, StoreEv, BECount, NegStride)) { TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); @@ -846,9 +787,9 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, SmallPtrSet<Instruction *, 1> MSIs; MSIs.insert(MSI); bool NegStride = SizeInBytes == -Stride; - return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getDestAlignment(), SplatValue, MSI, MSIs, - Ev, BECount, NegStride, /*IsLoopMemset=*/true); + return processLoopStridedStore( + Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()), + SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true); } /// mayLoopAccessLocation - Return true if the specified loop might access the @@ -938,7 +879,7 @@ static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, /// processLoopStridedStore - We see a strided store of some value. If we can /// transform this into a memset or memset_pattern in the loop preheader, do so. 
bool LoopIdiomRecognize::processLoopStridedStore( - Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, + Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool NegStride, bool IsLoopMemset) { @@ -960,12 +901,12 @@ bool LoopIdiomRecognize::processLoopStridedStore( SCEVExpander Expander(*SE, *DL, "loop-idiom"); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); - Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS); + Type *IntIdxTy = DL->getIndexType(DestPtr->getType()); const SCEV *Start = Ev->getStart(); // Handle negative strided loops. if (NegStride) - Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE); + Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE); // TODO: ideally we should still be able to generate memset if SCEV expander // is taught to generate the dependencies at the latest point. @@ -993,7 +934,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( // Okay, everything looks good, insert the memset. const SCEV *NumBytesS = - getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE); + getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE); // TODO: ideally we should still be able to generate memset if SCEV expander // is taught to generate the dependencies at the latest point. @@ -1001,12 +942,12 @@ bool LoopIdiomRecognize::processLoopStridedStore( return false; Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); CallInst *NewCall; if (SplatValue) { - NewCall = - Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); + NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, + MaybeAlign(StoreAlignment)); } else { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; @@ -1014,7 +955,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Module *M = TheStore->getModule(); StringRef FuncName = "memset_pattern16"; FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(), - Int8PtrTy, Int8PtrTy, IntPtr); + Int8PtrTy, Int8PtrTy, IntIdxTy); inferLibFuncAttributes(M, FuncName, *TLI); // Otherwise we should form a memset_pattern16. PatternValue is known to be @@ -1081,11 +1022,11 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *StrStart = StoreEv->getStart(); unsigned StrAS = SI->getPointerAddressSpace(); - Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS); + Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS)); // Handle negative strided loops. if (NegStride) - StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE); + StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE); // Okay, we have a strided store "p[i]" of a loaded value. We can turn // this into a memcpy in the loop preheader now if we want. However, this @@ -1111,7 +1052,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // Handle negative strided loops. if (NegStride) - LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE); + LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE); // For a memcpy, we have to make sure that the input array is not being // mutated by the loop. @@ -1133,18 +1074,18 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // Okay, everything is safe, we can transform this! 
const SCEV *NumBytesS = - getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE); + getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); CallInst *NewCall = nullptr; // Check whether to generate an unordered atomic memcpy: // If the load or store are atomic, then they must necessarily be unordered // by previous checks. if (!SI->isAtomic() && !LI->isAtomic()) - NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(), - LoadBasePtr, LI->getAlignment(), NumBytes); + NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr, + LI->getAlign(), NumBytes); else { // We cannot allow unaligned ops for unordered load/store, so reject // anything where the alignment isn't at least the element size. @@ -1211,7 +1152,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { << "] Noncountable Loop %" << CurLoop->getHeader()->getName() << "\n"); - return recognizeBCmp() || recognizePopcount() || recognizeAndInsertFFS(); + return recognizePopcount() || recognizeAndInsertFFS(); } /// Check if the given conditional branch is based on the comparison between @@ -1885,811 +1826,3 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, // loop. The loop would otherwise not be deleted even if it becomes empty. SE->forgetLoop(CurLoop); } - -bool LoopIdiomRecognize::matchBCmpLoopStructure( - CmpLoopStructure &CmpLoop) const { - ICmpInst::Predicate BCmpPred; - - // We are looking for the following basic layout: - // PreheaderBB: <preheader> ; preds = ??? - // <...> - // br label %LoopHeaderBB - // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB - // <...> - // %BCmpValue = icmp <...> - // br i1 %BCmpValue, label %LoopLatchBB, label %Successor0 - // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB - // <...> - // %LatchCmpValue = <are we done, or do next iteration?> - // br i1 %LatchCmpValue, label %Successor1, label %LoopHeaderBB - // Successor0: <exit> ; preds = %LoopHeaderBB - // <...> - // Successor1: <exit> ; preds = %LoopLatchBB - // <...> - // - // Successor0 and Successor1 may or may not be the same basic block. - - // Match basic frame-work of this supposedly-comparison loop. - using namespace PatternMatch; - if (!match(CurLoop->getHeader()->getTerminator(), - m_Br(m_CombineAnd(m_ICmp(BCmpPred, m_Value(), m_Value()), - m_Value(CmpLoop.BCmpValue)), - CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB)) || - !match(CurLoop->getLoopLatch()->getTerminator(), - m_Br(m_CombineAnd(m_Cmp(), m_Value(CmpLoop.LatchCmpValue)), - CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB))) { - LLVM_DEBUG(dbgs() << "Basic control-flow layout unrecognized.\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Recognized basic control-flow layout.\n"); - return true; -} - -bool LoopIdiomRecognize::matchBCmpOfLoads(Value *BCmpValue, - CmpOfLoads &CmpOfLoads) const { - using namespace PatternMatch; - LLVM_DEBUG(dbgs() << "Analyzing header icmp " << *BCmpValue - << " as bcmp pattern.\n"); - - // Match bcmp-style loop header cmp. It must be an eq-icmp of loads. Example: - // %v0 = load <...>, <...>* %LoadSrcA - // %v1 = load <...>, <...>* %LoadSrcB - // %CmpLoop.BCmpValue = icmp eq <...> %v0, %v1 - // There won't be any no-op bitcasts between load and icmp, - // they would have been transformed into a load of bitcast. - // FIXME: {b,mem}cmp() calls have the same semantics as icmp. Match them too. 
- if (!match(BCmpValue, - m_ICmp(CmpOfLoads.BCmpPred, - m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcA)), - m_Value(CmpOfLoads.LoadA)), - m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcB)), - m_Value(CmpOfLoads.LoadB)))) || - !ICmpInst::isEquality(CmpOfLoads.BCmpPred)) { - LLVM_DEBUG(dbgs() << "Loop header icmp did not match bcmp pattern.\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Recognized header icmp as bcmp pattern with loads:\n\t" - << *CmpOfLoads.LoadA << "\n\t" << *CmpOfLoads.LoadB - << "\n"); - // FIXME: handle memcmp pattern? - return true; -} - -bool LoopIdiomRecognize::recognizeBCmpLoopControlFlow( - const CmpOfLoads &CmpOfLoads, CmpLoopStructure &CmpLoop) const { - BasicBlock *LoopHeaderBB = CurLoop->getHeader(); - BasicBlock *LoopLatchBB = CurLoop->getLoopLatch(); - - // Be wary, comparisons can be inverted, canonicalize order. - // If this 'element' comparison passed, we expect to proceed to the next elt. - if (CmpOfLoads.BCmpPred != ICmpInst::Predicate::ICMP_EQ) - std::swap(CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB); - // The predicate on loop latch does not matter, just canonicalize some order. - if (CmpLoop.LatchBrContinueBB != LoopHeaderBB) - std::swap(CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB); - - SmallVector<BasicBlock *, 2> ExitBlocks; - - CurLoop->getUniqueExitBlocks(ExitBlocks); - assert(ExitBlocks.size() <= 2U && "Can't have more than two exit blocks."); - - // Check that control-flow between blocks is as expected. - if (CmpLoop.HeaderBrEqualBB != LoopLatchBB || - CmpLoop.LatchBrContinueBB != LoopHeaderBB || - !is_contained(ExitBlocks, CmpLoop.HeaderBrUnequalBB) || - !is_contained(ExitBlocks, CmpLoop.LatchBrFinishBB)) { - LLVM_DEBUG(dbgs() << "Loop control-flow not recognized.\n"); - return false; - } - - assert(!is_contained(ExitBlocks, CmpLoop.HeaderBrEqualBB) && - !is_contained(ExitBlocks, CmpLoop.LatchBrContinueBB) && - "Unexpected exit edges."); - - LLVM_DEBUG(dbgs() << "Recognized loop control-flow.\n"); - - LLVM_DEBUG(dbgs() << "Performing side-effect analysis on the loop.\n"); - assert(CurLoop->isLCSSAForm(*DT) && "Should only get LCSSA-form loops here."); - // No loop instructions must be used outside of the loop. Since we are in - // LCSSA form, we only need to check successor block's PHI nodes's incoming - // values for incoming blocks that are the loop basic blocks. - for (const BasicBlock *ExitBB : ExitBlocks) { - for (const PHINode &PHI : ExitBB->phis()) { - for (const BasicBlock *LoopBB : - make_filter_range(PHI.blocks(), [this](BasicBlock *PredecessorBB) { - return CurLoop->contains(PredecessorBB); - })) { - const auto *I = - dyn_cast<Instruction>(PHI.getIncomingValueForBlock(LoopBB)); - if (I && CurLoop->contains(I)) { - LLVM_DEBUG(dbgs() - << "Loop contains instruction " << *I - << " which is used outside of the loop in basic block " - << ExitBB->getName() << " in phi node " << PHI << "\n"); - return false; - } - } - } - } - // Similarly, the loop should not have any other observable side-effects - // other than the final comparison result. - for (BasicBlock *LoopBB : CurLoop->blocks()) { - for (Instruction &I : *LoopBB) { - if (isa<DbgInfoIntrinsic>(I)) // Ignore dbginfo. - continue; // FIXME: anything else? lifetime info? 
- if ((I.mayHaveSideEffects() || I.isAtomic() || I.isFenceLike()) && - &I != CmpOfLoads.LoadA && &I != CmpOfLoads.LoadB) { - LLVM_DEBUG( - dbgs() << "Loop contains instruction with potential side-effects: " - << I << "\n"); - return false; - } - } - } - LLVM_DEBUG(dbgs() << "No loop instructions deemed to have side-effects.\n"); - return true; -} - -bool LoopIdiomRecognize::recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, - CmpOfLoads &CmpOfLoads, - const SCEV *&SrcA, - const SCEV *&SrcB, - const SCEV *&Iterations) const { - // Try to compute SCEV of the loads, for this loop's scope. - const auto *ScevForSrcA = dyn_cast<SCEVAddRecExpr>( - SE->getSCEVAtScope(CmpOfLoads.LoadSrcA, CurLoop)); - const auto *ScevForSrcB = dyn_cast<SCEVAddRecExpr>( - SE->getSCEVAtScope(CmpOfLoads.LoadSrcB, CurLoop)); - if (!ScevForSrcA || !ScevForSrcB) { - LLVM_DEBUG(dbgs() << "Failed to get SCEV expressions for load sources.\n"); - return false; - } - - LLVM_DEBUG(dbgs() << "Got SCEV expressions (at loop scope) for loads:\n\t" - << *ScevForSrcA << "\n\t" << *ScevForSrcB << "\n"); - - // Loads must have folloving SCEV exprs: {%ptr,+,BCmpTyBytes}<%LoopHeaderBB> - const SCEV *RecStepForA = ScevForSrcA->getStepRecurrence(*SE); - const SCEV *RecStepForB = ScevForSrcB->getStepRecurrence(*SE); - if (!ScevForSrcA->isAffine() || !ScevForSrcB->isAffine() || - ScevForSrcA->getLoop() != CurLoop || ScevForSrcB->getLoop() != CurLoop || - RecStepForA != RecStepForB || !isa<SCEVConstant>(RecStepForA) || - cast<SCEVConstant>(RecStepForA)->getAPInt() != BCmpTyBytes) { - LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads. Only support " - "affine SCEV expressions originating in the loop we " - "are analysing with identical constant positive step, " - "equal to the count of bytes compared. Got:\n\t" - << *RecStepForA << "\n\t" << *RecStepForB << "\n"); - return false; - // FIXME: can support BCmpTyBytes > Step. - // But will need to account for the extra bytes compared at the end. - } - - SrcA = ScevForSrcA->getStart(); - SrcB = ScevForSrcB->getStart(); - LLVM_DEBUG(dbgs() << "Got SCEV expressions for load sources:\n\t" << *SrcA - << "\n\t" << *SrcB << "\n"); - - // The load sources must be loop-invants that dominate the loop header. - if (SrcA == SE->getCouldNotCompute() || SrcB == SE->getCouldNotCompute() || - !SE->isAvailableAtLoopEntry(SrcA, CurLoop) || - !SE->isAvailableAtLoopEntry(SrcB, CurLoop)) { - LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads, unavaliable " - "prior to loop header.\n"); - return false; - } - - LLVM_DEBUG(dbgs() << "SCEV expressions for loads are acceptable.\n"); - - // bcmp / memcmp take length argument as size_t, so let's conservatively - // assume that the iteration count should be not wider than that. - Type *CmpFuncSizeTy = DL->getIntPtrType(SE->getContext()); - - // For how many iterations is loop guaranteed not to exit via LoopLatch? - // This is one less than the maximal number of comparisons,and is: n + -1 - const SCEV *LoopExitCount = - SE->getExitCount(CurLoop, CurLoop->getLoopLatch()); - LLVM_DEBUG(dbgs() << "Got SCEV expression for loop latch exit count: " - << *LoopExitCount << "\n"); - // Exit count, similarly, must be loop-invant that dominates the loop header. 
- if (LoopExitCount == SE->getCouldNotCompute() || - !LoopExitCount->getType()->isIntOrPtrTy() || - LoopExitCount->getType()->getScalarSizeInBits() > - CmpFuncSizeTy->getScalarSizeInBits() || - !SE->isAvailableAtLoopEntry(LoopExitCount, CurLoop)) { - LLVM_DEBUG(dbgs() << "Unsupported SCEV expression for loop latch exit.\n"); - return false; - } - - // LoopExitCount is always one less than the actual count of iterations. - // Do this before cast, else we will be stuck with 1 + zext(-1 + n) - Iterations = SE->getAddExpr( - LoopExitCount, SE->getOne(LoopExitCount->getType()), SCEV::FlagNUW); - assert(Iterations != SE->getCouldNotCompute() && - "Shouldn't fail to increment by one."); - - LLVM_DEBUG(dbgs() << "Computed iteration count: " << *Iterations << "\n"); - return true; -} - -/// Return true iff the bcmp idiom is detected in the loop. -/// -/// Additionally: -/// 1) \p BCmpInst is set to the root byte-comparison instruction. -/// 2) \p LatchCmpInst is set to the comparison that controls the latch. -/// 3) \p LoadA is set to the first LoadInst. -/// 4) \p LoadB is set to the second LoadInst. -/// 5) \p SrcA is set to the first source location that is being compared. -/// 6) \p SrcB is set to the second source location that is being compared. -/// 7) \p NBytes is set to the number of bytes to compare. -bool LoopIdiomRecognize::detectBCmpIdiom(ICmpInst *&BCmpInst, - CmpInst *&LatchCmpInst, - LoadInst *&LoadA, LoadInst *&LoadB, - const SCEV *&SrcA, const SCEV *&SrcB, - const SCEV *&NBytes) const { - LLVM_DEBUG(dbgs() << "Recognizing bcmp idiom\n"); - - // Give up if the loop is not in normal form, or has more than 2 blocks. - if (!CurLoop->isLoopSimplifyForm() || CurLoop->getNumBlocks() > 2) { - LLVM_DEBUG(dbgs() << "Basic loop structure unrecognized.\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Recognized basic loop structure.\n"); - - CmpLoopStructure CmpLoop; - if (!matchBCmpLoopStructure(CmpLoop)) - return false; - - CmpOfLoads CmpOfLoads; - if (!matchBCmpOfLoads(CmpLoop.BCmpValue, CmpOfLoads)) - return false; - - if (!recognizeBCmpLoopControlFlow(CmpOfLoads, CmpLoop)) - return false; - - BCmpInst = cast<ICmpInst>(CmpLoop.BCmpValue); // FIXME: is there no - LatchCmpInst = cast<CmpInst>(CmpLoop.LatchCmpValue); // way to combine - LoadA = cast<LoadInst>(CmpOfLoads.LoadA); // these cast with - LoadB = cast<LoadInst>(CmpOfLoads.LoadB); // m_Value() matcher? - - Type *BCmpValTy = BCmpInst->getOperand(0)->getType(); - LLVMContext &Context = BCmpValTy->getContext(); - uint64_t BCmpTyBits = DL->getTypeSizeInBits(BCmpValTy); - static constexpr uint64_t ByteTyBits = 8; - - LLVM_DEBUG(dbgs() << "Got comparison between values of type " << *BCmpValTy - << " of size " << BCmpTyBits - << " bits (while byte = " << ByteTyBits << " bits).\n"); - // bcmp()/memcmp() minimal unit of work is a byte. Therefore we must check - // that we are dealing with a multiple of a byte here. - if (BCmpTyBits % ByteTyBits != 0) { - LLVM_DEBUG(dbgs() << "Value size is not a multiple of byte.\n"); - return false; - // FIXME: could still be done under a run-time check that the total bit - // count is a multiple of a byte i guess? Or handle remainder separately? - } - - // Each comparison is done on this many bytes. 
- uint64_t BCmpTyBytes = BCmpTyBits / ByteTyBits; - LLVM_DEBUG(dbgs() << "Size is exactly " << BCmpTyBytes - << " bytes, eligible for bcmp conversion.\n"); - - const SCEV *Iterations; - if (!recognizeBCmpLoopSCEV(BCmpTyBytes, CmpOfLoads, SrcA, SrcB, Iterations)) - return false; - - // bcmp / memcmp take length argument as size_t, do promotion now. - Type *CmpFuncSizeTy = DL->getIntPtrType(Context); - Iterations = SE->getNoopOrZeroExtend(Iterations, CmpFuncSizeTy); - assert(Iterations != SE->getCouldNotCompute() && "Promotion failed."); - // Note that it didn't do ptrtoint cast, we will need to do it manually. - - // We will be comparing *bytes*, not BCmpTy, we need to recalculate size. - // It's a multiplication, and it *could* overflow. But for it to overflow - // we'd want to compare more bytes than could be represented by size_t, But - // allocation functions also take size_t. So how'd you produce such buffer? - // FIXME: we likely need to actually check that we know this won't overflow, - // via llvm::computeOverflowForUnsignedMul(). - NBytes = SE->getMulExpr( - Iterations, SE->getConstant(CmpFuncSizeTy, BCmpTyBytes), SCEV::FlagNUW); - assert(NBytes != SE->getCouldNotCompute() && - "Shouldn't fail to increment by one."); - - LLVM_DEBUG(dbgs() << "Computed total byte count: " << *NBytes << "\n"); - - if (LoadA->getPointerAddressSpace() != LoadB->getPointerAddressSpace() || - LoadA->getPointerAddressSpace() != 0 || !LoadA->isSimple() || - !LoadB->isSimple()) { - StringLiteral L("Unsupported loads in idiom - only support identical, " - "simple loads from address space 0.\n"); - LLVM_DEBUG(dbgs() << L); - ORE.emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "BCmpIdiomUnsupportedLoads", - BCmpInst->getDebugLoc(), - CurLoop->getHeader()) - << L; - }); - return false; // FIXME: support non-simple loads. - } - - LLVM_DEBUG(dbgs() << "Recognized bcmp idiom\n"); - ORE.emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "RecognizedBCmpIdiom", - CurLoop->getStartLoc(), - CurLoop->getHeader()) - << "Loop recognized as a bcmp idiom"; - }); - - return true; -} - -BasicBlock * -LoopIdiomRecognize::transformBCmpControlFlow(ICmpInst *ComparedEqual) { - LLVM_DEBUG(dbgs() << "Transforming control-flow.\n"); - SmallVector<DominatorTree::UpdateType, 8> DTUpdates; - - BasicBlock *PreheaderBB = CurLoop->getLoopPreheader(); - BasicBlock *HeaderBB = CurLoop->getHeader(); - BasicBlock *LoopLatchBB = CurLoop->getLoopLatch(); - SmallString<32> LoopName = CurLoop->getName(); - Function *Func = PreheaderBB->getParent(); - LLVMContext &Context = Func->getContext(); - - // Before doing anything, drop SCEV info. - SE->forgetLoop(CurLoop); - - // Here we start with: (0/6) - // PreheaderBB: <preheader> ; preds = ??? - // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // br label %LoopHeaderBB - // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB - // <...> - // br i1 %<...>, label %LoopLatchBB, label %Successor0BB - // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB - // <...> - // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB - // Successor0BB: <exit> ; preds = %LoopHeaderBB - // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ] - // <...> - // Successor1BB: <exit> ; preds = %LoopLatchBB - // %S1PHI = phi <...> [ <...>, %LoopLatchBB ] - // <...> - // - // Successor0 and Successor1 may or may not be the same basic block. 
- - // Decouple the edge between loop preheader basic block and loop header basic - // block. Thus the loop has become unreachable. - assert(cast<BranchInst>(PreheaderBB->getTerminator())->isUnconditional() && - PreheaderBB->getTerminator()->getSuccessor(0) == HeaderBB && - "Preheader bb must end with an unconditional branch to header bb."); - PreheaderBB->getTerminator()->eraseFromParent(); - DTUpdates.push_back({DominatorTree::Delete, PreheaderBB, HeaderBB}); - - // Create a new preheader basic block before loop header basic block. - auto *PhonyPreheaderBB = BasicBlock::Create( - Context, LoopName + ".phonypreheaderbb", Func, HeaderBB); - // And insert an unconditional branch from phony preheader basic block to - // loop header basic block. - IRBuilder<>(PhonyPreheaderBB).CreateBr(HeaderBB); - DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB}); - - // Create a *single* new empty block that we will substitute as a - // successor basic block for the loop's exits. This one is temporary. - // Much like phony preheader basic block, it is not connected. - auto *PhonySuccessorBB = - BasicBlock::Create(Context, LoopName + ".phonysuccessorbb", Func, - LoopLatchBB->getNextNode()); - // That block must have *some* non-PHI instruction, or else deleteDeadLoop() - // will mess up cleanup of dbginfo, and verifier will complain. - IRBuilder<>(PhonySuccessorBB).CreateUnreachable(); - - // Create two new empty blocks that we will use to preserve the original - // loop exit control-flow, and preserve the incoming values in the PHI nodes - // in loop's successor exit blocks. These will live one. - auto *ComparedUnequalBB = - BasicBlock::Create(Context, ComparedEqual->getName() + ".unequalbb", Func, - PhonySuccessorBB->getNextNode()); - auto *ComparedEqualBB = - BasicBlock::Create(Context, ComparedEqual->getName() + ".equalbb", Func, - PhonySuccessorBB->getNextNode()); - - // By now we have: (1/6) - // PreheaderBB: ; preds = ??? - // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // [no terminator instruction!] - // PhonyPreheaderBB: <preheader> ; No preds, UNREACHABLE! - // br label %LoopHeaderBB - // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB - // <...> - // br i1 %<...>, label %LoopLatchBB, label %Successor0BB - // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB - // <...> - // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB - // PhonySuccessorBB: ; No preds, UNREACHABLE! - // unreachable - // EqualBB: ; No preds, UNREACHABLE! - // [no terminator instruction!] - // UnequalBB: ; No preds, UNREACHABLE! - // [no terminator instruction!] - // Successor0BB: <exit> ; preds = %LoopHeaderBB - // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ] - // <...> - // Successor1BB: <exit> ; preds = %LoopLatchBB - // %S1PHI = phi <...> [ <...>, %LoopLatchBB ] - // <...> - - // What is the mapping/replacement basic block for exiting out of the loop - // from either of old's loop basic blocks? - auto GetReplacementBB = [this, ComparedEqualBB, - ComparedUnequalBB](const BasicBlock *OldBB) { - assert(CurLoop->contains(OldBB) && "Only for loop's basic blocks."); - if (OldBB == CurLoop->getLoopLatch()) // "all elements compared equal". - return ComparedEqualBB; - if (OldBB == CurLoop->getHeader()) // "element compared unequal". - return ComparedUnequalBB; - llvm_unreachable("Only had two basic blocks in loop."); - }; - - // What are the exits out of this loop? 
- SmallVector<Loop::Edge, 2> LoopExitEdges; - CurLoop->getExitEdges(LoopExitEdges); - assert(LoopExitEdges.size() == 2 && "Should have only to two exit edges."); - - // Populate new basic blocks, update the exiting control-flow, PHI nodes. - for (const Loop::Edge &Edge : LoopExitEdges) { - auto *OldLoopBB = const_cast<BasicBlock *>(Edge.first); - auto *SuccessorBB = const_cast<BasicBlock *>(Edge.second); - assert(CurLoop->contains(OldLoopBB) && !CurLoop->contains(SuccessorBB) && - "Unexpected edge."); - - // If we would exit the loop from this loop's basic block, - // what semantically would that mean? Did comparison succeed or fail? - BasicBlock *NewBB = GetReplacementBB(OldLoopBB); - assert(NewBB->empty() && "Should not get same new basic block here twice."); - IRBuilder<> Builder(NewBB); - Builder.SetCurrentDebugLocation(OldLoopBB->getTerminator()->getDebugLoc()); - Builder.CreateBr(SuccessorBB); - DTUpdates.push_back({DominatorTree::Insert, NewBB, SuccessorBB}); - // Also, be *REALLY* careful with PHI nodes in successor basic block, - // update them to recieve the same input value, but not from current loop's - // basic block, but from new basic block instead. - SuccessorBB->replacePhiUsesWith(OldLoopBB, NewBB); - // Also, change loop control-flow. This loop's basic block shall no longer - // exit from the loop to it's original successor basic block, but to our new - // phony successor basic block. Note that new successor will be unique exit. - OldLoopBB->getTerminator()->replaceSuccessorWith(SuccessorBB, - PhonySuccessorBB); - DTUpdates.push_back({DominatorTree::Delete, OldLoopBB, SuccessorBB}); - DTUpdates.push_back({DominatorTree::Insert, OldLoopBB, PhonySuccessorBB}); - } - - // Inform DomTree about edge changes. Note that LoopInfo is still out-of-date. - assert(DTUpdates.size() == 8 && "Update count prediction failed."); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - DTU.applyUpdates(DTUpdates); - DTUpdates.clear(); - - // By now we have: (2/6) - // PreheaderBB: ; preds = ??? - // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // [no terminator instruction!] - // PhonyPreheaderBB: <preheader> ; No preds, UNREACHABLE! - // br label %LoopHeaderBB - // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB - // <...> - // br i1 %<...>, label %LoopLatchBB, label %PhonySuccessorBB - // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB - // <...> - // br i1 %<...>, label %PhonySuccessorBB, label %LoopHeaderBB - // PhonySuccessorBB: <uniq. exit> ; preds = %LoopHeaderBB, %LoopLatchBB - // unreachable - // EqualBB: ; No preds, UNREACHABLE! - // br label %Successor1BB - // UnequalBB: ; No preds, UNREACHABLE! - // br label %Successor0BB - // Successor0BB: ; preds = %UnequalBB - // %S0PHI = phi <...> [ <...>, %UnequalBB ] - // <...> - // Successor1BB: ; preds = %EqualBB - // %S0PHI = phi <...> [ <...>, %EqualBB ] - // <...> - - // *Finally*, zap the original loop. Record it's parent loop though. - Loop *ParentLoop = CurLoop->getParentLoop(); - LLVM_DEBUG(dbgs() << "Deleting old loop.\n"); - LoopDeleter.markLoopAsDeleted(CurLoop); // Mark as deleted *BEFORE* deleting! - deleteDeadLoop(CurLoop, DT, SE, LI); // And actually delete the loop. - CurLoop = nullptr; - - // By now we have: (3/6) - // PreheaderBB: ; preds = ??? 
- // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // [no terminator instruction!] - // PhonyPreheaderBB: ; No preds, UNREACHABLE! - // br label %PhonySuccessorBB - // PhonySuccessorBB: ; preds = %PhonyPreheaderBB - // unreachable - // EqualBB: ; No preds, UNREACHABLE! - // br label %Successor1BB - // UnequalBB: ; No preds, UNREACHABLE! - // br label %Successor0BB - // Successor0BB: ; preds = %UnequalBB - // %S0PHI = phi <...> [ <...>, %UnequalBB ] - // <...> - // Successor1BB: ; preds = %EqualBB - // %S0PHI = phi <...> [ <...>, %EqualBB ] - // <...> - - // Now, actually restore the CFG. - - // Insert an unconditional branch from an actual preheader basic block to - // phony preheader basic block. - IRBuilder<>(PreheaderBB).CreateBr(PhonyPreheaderBB); - DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB}); - // Insert proper conditional branch from phony successor basic block to the - // "dispatch" basic blocks, which were used to preserve incoming values in - // original loop's successor basic blocks. - assert(isa<UnreachableInst>(PhonySuccessorBB->getTerminator()) && - "Yep, that's the one we created to keep deleteDeadLoop() happy."); - PhonySuccessorBB->getTerminator()->eraseFromParent(); - { - IRBuilder<> Builder(PhonySuccessorBB); - Builder.SetCurrentDebugLocation(ComparedEqual->getDebugLoc()); - Builder.CreateCondBr(ComparedEqual, ComparedEqualBB, ComparedUnequalBB); - } - DTUpdates.push_back( - {DominatorTree::Insert, PhonySuccessorBB, ComparedEqualBB}); - DTUpdates.push_back( - {DominatorTree::Insert, PhonySuccessorBB, ComparedUnequalBB}); - - BasicBlock *DispatchBB = PhonySuccessorBB; - DispatchBB->setName(LoopName + ".bcmpdispatchbb"); - - assert(DTUpdates.size() == 3 && "Update count prediction failed."); - DTU.applyUpdates(DTUpdates); - DTUpdates.clear(); - - // By now we have: (4/6) - // PreheaderBB: ; preds = ??? - // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // br label %PhonyPreheaderBB - // PhonyPreheaderBB: ; preds = %PreheaderBB - // br label %DispatchBB - // DispatchBB: ; preds = %PhonyPreheaderBB - // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB - // EqualBB: ; preds = %DispatchBB - // br label %Successor1BB - // UnequalBB: ; preds = %DispatchBB - // br label %Successor0BB - // Successor0BB: ; preds = %UnequalBB - // %S0PHI = phi <...> [ <...>, %UnequalBB ] - // <...> - // Successor1BB: ; preds = %EqualBB - // %S0PHI = phi <...> [ <...>, %EqualBB ] - // <...> - - // The basic CFG has been restored! Now let's merge redundant basic blocks. - - // Merge phony successor basic block into it's only predecessor, - // phony preheader basic block. It is fully pointlessly redundant. - MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU); - - // By now we have: (5/6) - // PreheaderBB: ; preds = ??? 
- // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // br label %DispatchBB - // DispatchBB: ; preds = %PreheaderBB - // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB - // EqualBB: ; preds = %DispatchBB - // br label %Successor1BB - // UnequalBB: ; preds = %DispatchBB - // br label %Successor0BB - // Successor0BB: ; preds = %UnequalBB - // %S0PHI = phi <...> [ <...>, %UnequalBB ] - // <...> - // Successor1BB: ; preds = %EqualBB - // %S0PHI = phi <...> [ <...>, %EqualBB ] - // <...> - - // Was this loop nested? - if (!ParentLoop) { - // If the loop was *NOT* nested, then let's also merge phony successor - // basic block into it's only predecessor, preheader basic block. - // Also, here we need to update LoopInfo. - LI->removeBlock(PreheaderBB); - MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU); - - // By now we have: (6/6) - // DispatchBB: ; preds = ??? - // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB - // EqualBB: ; preds = %DispatchBB - // br label %Successor1BB - // UnequalBB: ; preds = %DispatchBB - // br label %Successor0BB - // Successor0BB: ; preds = %UnequalBB - // %S0PHI = phi <...> [ <...>, %UnequalBB ] - // <...> - // Successor1BB: ; preds = %EqualBB - // %S0PHI = phi <...> [ <...>, %EqualBB ] - // <...> - - return DispatchBB; - } - - // Otherwise, we need to "preserve" the LoopSimplify form of the deleted loop. - // To achieve that, we shall keep the preheader basic block (mainly so that - // the loop header block will be guaranteed to have a predecessor outside of - // the loop), and create a phony loop with all these new three basic blocks. - Loop *PhonyLoop = LI->AllocateLoop(); - ParentLoop->addChildLoop(PhonyLoop); - PhonyLoop->addBasicBlockToLoop(DispatchBB, *LI); - PhonyLoop->addBasicBlockToLoop(ComparedEqualBB, *LI); - PhonyLoop->addBasicBlockToLoop(ComparedUnequalBB, *LI); - - // But we only have a preheader basic block, a header basic block block and - // two exiting basic blocks. For a proper loop we also need a backedge from - // non-header basic block to header bb. - // Let's just add a never-taken branch from both of the exiting basic blocks. - for (BasicBlock *BB : {ComparedEqualBB, ComparedUnequalBB}) { - BranchInst *OldTerminator = cast<BranchInst>(BB->getTerminator()); - assert(OldTerminator->isUnconditional() && "That's the one we created."); - BasicBlock *SuccessorBB = OldTerminator->getSuccessor(0); - - IRBuilder<> Builder(OldTerminator); - Builder.SetCurrentDebugLocation(OldTerminator->getDebugLoc()); - Builder.CreateCondBr(ConstantInt::getTrue(Context), SuccessorBB, - DispatchBB); - OldTerminator->eraseFromParent(); - // Yes, the backedge will never be taken. The control-flow is redundant. - // If it can be simplified further, other passes will take care. - DTUpdates.push_back({DominatorTree::Delete, BB, SuccessorBB}); - DTUpdates.push_back({DominatorTree::Insert, BB, SuccessorBB}); - DTUpdates.push_back({DominatorTree::Insert, BB, DispatchBB}); - } - assert(DTUpdates.size() == 6 && "Update count prediction failed."); - DTU.applyUpdates(DTUpdates); - DTUpdates.clear(); - - // By now we have: (6/6) - // PreheaderBB: <preheader> ; preds = ??? 
- // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // br label %BCmpDispatchBB - // BCmpDispatchBB: <header> ; preds = %PreheaderBB - // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB - // EqualBB: <latch,exiting> ; preds = %BCmpDispatchBB - // br i1 %true, label %Successor1BB, label %BCmpDispatchBB - // UnequalBB: <latch,exiting> ; preds = %BCmpDispatchBB - // br i1 %true, label %Successor0BB, label %BCmpDispatchBB - // Successor0BB: ; preds = %UnequalBB - // %S0PHI = phi <...> [ <...>, %UnequalBB ] - // <...> - // Successor1BB: ; preds = %EqualBB - // %S0PHI = phi <...> [ <...>, %EqualBB ] - // <...> - - // Finally fully DONE! - return DispatchBB; -} - -void LoopIdiomRecognize::transformLoopToBCmp(ICmpInst *BCmpInst, - CmpInst *LatchCmpInst, - LoadInst *LoadA, LoadInst *LoadB, - const SCEV *SrcA, const SCEV *SrcB, - const SCEV *NBytes) { - // We will be inserting before the terminator instruction of preheader block. - IRBuilder<> Builder(CurLoop->getLoopPreheader()->getTerminator()); - - LLVM_DEBUG(dbgs() << "Transforming bcmp loop idiom into a call.\n"); - LLVM_DEBUG(dbgs() << "Emitting new instructions.\n"); - - // Expand the SCEV expressions for both sources to compare, and produce value - // for the byte len (beware of Iterations potentially being a pointer, and - // account for element size being BCmpTyBytes bytes, which may be not 1 byte) - Value *PtrA, *PtrB, *Len; - { - SCEVExpander SExp(*SE, *DL, "LoopToBCmp"); - SExp.setInsertPoint(&*Builder.GetInsertPoint()); - - auto HandlePtr = [&SExp](LoadInst *Load, const SCEV *Src) { - SExp.SetCurrentDebugLocation(DebugLoc()); - // If the pointer operand of original load had dbgloc - use it. - if (const auto *I = dyn_cast<Instruction>(Load->getPointerOperand())) - SExp.SetCurrentDebugLocation(I->getDebugLoc()); - return SExp.expandCodeFor(Src); - }; - PtrA = HandlePtr(LoadA, SrcA); - PtrB = HandlePtr(LoadB, SrcB); - - // For len calculation let's use dbgloc for the loop's latch condition. - Builder.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc()); - SExp.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc()); - Len = SExp.expandCodeFor(NBytes); - - Type *CmpFuncSizeTy = DL->getIntPtrType(Builder.getContext()); - assert(SE->getTypeSizeInBits(Len->getType()) == - DL->getTypeSizeInBits(CmpFuncSizeTy) && - "Len should already have the correct size."); - - // Make sure that iteration count is a number, insert ptrtoint cast if not. - if (Len->getType()->isPointerTy()) - Len = Builder.CreatePtrToInt(Len, CmpFuncSizeTy); - assert(Len->getType() == CmpFuncSizeTy && "Should have correct type now."); - - Len->setName(Len->getName() + ".bytecount"); - - // There is no legality check needed. We want to compare that the memory - // regions [PtrA, PtrA+Len) and [PtrB, PtrB+Len) are fully identical, equal. - // For them to be fully equal, they must match bit-by-bit. And likewise, - // for them to *NOT* be fully equal, they have to differ just by one bit. - // The step of comparison (bits compared at once) simply does not matter. - } - - // For the rest of new instructions, dbgloc should point at the value cmp. - Builder.SetCurrentDebugLocation(BCmpInst->getDebugLoc()); - - // Emit the comparison itself. - auto *CmpCall = - cast<CallInst>(HasBCmp ? emitBCmp(PtrA, PtrB, Len, Builder, *DL, TLI) - : emitMemCmp(PtrA, PtrB, Len, Builder, *DL, TLI)); - // FIXME: add {B,Mem}CmpInst with MemoryCompareInst - // (based on MemIntrinsicBase) as base? 
- // FIXME: propagate metadata from loads? (alignments, AS, TBAA, ...) - - // {b,mem}cmp returned 0 if they were equal, or non-zero if not equal. - auto *ComparedEqual = cast<ICmpInst>(Builder.CreateICmpEQ( - CmpCall, ConstantInt::get(CmpCall->getType(), 0), - PtrA->getName() + ".vs." + PtrB->getName() + ".eqcmp")); - - BasicBlock *BB = transformBCmpControlFlow(ComparedEqual); - Builder.ClearInsertionPoint(); - - // We're done. - LLVM_DEBUG(dbgs() << "Transformed loop bcmp idiom into a call.\n"); - ORE.emit([&]() { - return OptimizationRemark(DEBUG_TYPE, "TransformedBCmpIdiomToCall", - CmpCall->getDebugLoc(), BB) - << "Transformed bcmp idiom into a call to " - << ore::NV("NewFunction", CmpCall->getCalledFunction()) - << "() function"; - }); - ++NumBCmp; -} - -/// Recognizes a bcmp idiom in a non-countable loop. -/// -/// If detected, transforms the relevant code to issue the bcmp (or memcmp) -/// intrinsic function call, and returns true; otherwise, returns false. -bool LoopIdiomRecognize::recognizeBCmp() { - if (!HasMemCmp && !HasBCmp) - return false; - - ICmpInst *BCmpInst; - CmpInst *LatchCmpInst; - LoadInst *LoadA, *LoadB; - const SCEV *SrcA, *SrcB, *NBytes; - if (!detectBCmpIdiom(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB, - NBytes)) { - LLVM_DEBUG(dbgs() << "bcmp idiom recognition failed.\n"); - return false; - } - - transformLoopToBCmp(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB, NBytes); - return true; -} diff --git a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index 368b9d4e8df17..901204181a7cb 100644 --- a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/User.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" @@ -226,7 +227,8 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, Optional<MemorySSAUpdater> MSSAU; if (AR.MSSA) { MSSAU = MemorySSAUpdater(AR.MSSA); - AR.MSSA->verifyMemorySSA(); + if (VerifyMemorySSA) + AR.MSSA->verifyMemorySSA(); } if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr)) diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 1af4b21b432e2..6ce2d06058cf3 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -716,22 +717,6 @@ bool LoopInterchangeLegality::findInductionAndReductions( return true; } -static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { - for (PHINode &PHI : Block->phis()) { - // Reduction lcssa phi will have only 1 incoming block that from loop latch. - if (PHI.getNumIncomingValues() > 1) - return false; - Instruction *Ins = dyn_cast<Instruction>(PHI.getIncomingValue(0)); - if (!Ins) - return false; - // Incoming value for lcssa phi's in outer loop exit can only be inner loop - // exits lcssa phi else it would not be tightly nested. 
- if (!isa<PHINode>(Ins) && isOuterLoopExitBlock) - return false; - } - return true; -} - // This function indicates the current limitations in the transform as a result // of which we do not proceed. bool LoopInterchangeLegality::currentLimitations() { @@ -830,21 +815,6 @@ bool LoopInterchangeLegality::currentLimitations() { return true; } - // TODO: We only handle LCSSA PHI's corresponding to reduction for now. - BasicBlock *InnerExit = InnerLoop->getExitBlock(); - if (!containsSafePHI(InnerExit, false)) { - LLVM_DEBUG( - dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Only inner loops with LCSSA PHIs can be interchange " - "currently."; - }); - return true; - } - // TODO: Current limitation: Since we split the inner loop latch at the point // were induction variable is incremented (induction.next); We cannot have // more than 1 user of induction.next since it would result in broken code @@ -920,6 +890,28 @@ bool LoopInterchangeLegality::currentLimitations() { return false; } +// We currently only support LCSSA PHI nodes in the inner loop exit, if their +// users are either reduction PHIs or PHIs outside the outer loop (which means +// the we are only interested in the final value after the loop). +static bool +areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, + SmallPtrSetImpl<PHINode *> &Reductions) { + BasicBlock *InnerExit = OuterL->getUniqueExitBlock(); + for (PHINode &PHI : InnerExit->phis()) { + // Reduction lcssa phi will have only 1 incoming block that from loop latch. + if (PHI.getNumIncomingValues() > 1) + return false; + if (any_of(PHI.users(), [&Reductions, OuterL](User *U) { + PHINode *PN = dyn_cast<PHINode>(U); + return !PN || (Reductions.find(PN) == Reductions.end() && + OuterL->contains(PN->getParent())); + })) { + return false; + } + } + return true; +} + // We currently support LCSSA PHI nodes in the outer loop exit, if their // incoming values do not come from the outer loop latch or if the // outer loop latch has a single predecessor. In that case, the value will @@ -927,7 +919,7 @@ bool LoopInterchangeLegality::currentLimitations() { // will still be true after interchanging. If we have multiple predecessor, // that may not be the case, e.g. because the outer loop latch may be executed // if the inner loop is not executed. 
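A hypothetical loop nest of the shape these legality checks reason about, where the only value live out of the inner loop is a reduction, so the inner-exit LCSSA PHI's users are reduction PHIs (or live past the outer loop):

    // Illustrative only; interchanging the two loops turns the strided
    // column-major walk into a contiguous one, and the reduction Sum is the
    // sole value flowing through the inner loop's exit PHIs.
    int sumColumnMajor(int A[64][64]) {
      int Sum = 0;
      for (int I = 0; I < 64; ++I)     // outer loop
        for (int J = 0; J < 64; ++J)   // inner loop
          Sum += A[J][I];
      return Sum;
    }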
-static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { +static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); for (PHINode &PHI : LoopNestExit->phis()) { // FIXME: We currently are not able to detect floating point reductions @@ -1012,7 +1004,19 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, return false; } - if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) { + if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, + OuterInnerReductions)) { + LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Found unsupported PHI node in loop exit."; + }); + return false; + } + + if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) { LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", @@ -1315,31 +1319,39 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { FromBB->getTerminator()->getIterator()); } -/// Update BI to jump to NewBB instead of OldBB. Records updates to -/// the dominator tree in DTUpdates, if DT should be preserved. +// Update BI to jump to NewBB instead of OldBB. Records updates to the +// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that +// \p OldBB is exactly once in BI's successor list. static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, BasicBlock *NewBB, - std::vector<DominatorTree::UpdateType> &DTUpdates) { - assert(llvm::count_if(successors(BI), - [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 && - "BI must jump to OldBB at most once."); - for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; ++i) { - if (BI->getSuccessor(i) == OldBB) { - BI->setSuccessor(i, NewBB); - - DTUpdates.push_back( - {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); - DTUpdates.push_back( - {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); - break; + std::vector<DominatorTree::UpdateType> &DTUpdates, + bool MustUpdateOnce = true) { + assert((!MustUpdateOnce || + llvm::count_if(successors(BI), + [OldBB](BasicBlock *BB) { + return BB == OldBB; + }) == 1) && "BI must jump to OldBB exactly once."); + bool Changed = false; + for (Use &Op : BI->operands()) + if (Op == OldBB) { + Op.set(NewBB); + Changed = true; } + + if (Changed) { + DTUpdates.push_back( + {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); + DTUpdates.push_back( + {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); } + assert(Changed && "Expected a successor to be updated"); } // Move Lcssa PHIs to the right place. static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, - BasicBlock *OuterLatch, BasicBlock *OuterExit) { + BasicBlock *OuterLatch, BasicBlock *OuterExit, + Loop *InnerLoop, LoopInfo *LI) { // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are // defined either in the header or latch. Those blocks will become header and @@ -1394,19 +1406,17 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, P->moveBefore(InnerExit->getFirstNonPHI()); // Deal with LCSSA PHI nodes in the loop nest exit block. 
For PHIs that have - // incoming values from the outer latch or header, we have to add a new PHI + // incoming values defined in the outer loop, we have to add a new PHI // in the inner loop latch, which became the exit block of the outer loop, // after interchanging. if (OuterExit) { for (PHINode &P : OuterExit->phis()) { if (P.getNumIncomingValues() != 1) continue; - // Skip Phis with incoming values not defined in the outer loop's header - // and latch. Also skip incoming phis defined in the latch. Those should + // Skip Phis with incoming values defined in the inner loop. Those should // already have been updated. auto I = dyn_cast<Instruction>(P.getIncomingValue(0)); - if (!I || ((I->getParent() != OuterLatch || isa<PHINode>(I)) && - I->getParent() != OuterHeader)) + if (!I || LI->getLoopFor(I->getParent()) == InnerLoop) continue; PHINode *NewPhi = dyn_cast<PHINode>(P.clone()); @@ -1481,12 +1491,21 @@ bool LoopInterchangeTransform::adjustLoopBranches() { if (!InnerLoopHeaderSuccessor) return false; - // Adjust Loop Preheader and headers + // Adjust Loop Preheader and headers. + // The branches in the outer loop predecessor and the outer loop header can + // be unconditional branches or conditional branches with duplicates. Consider + // this when updating the successors. updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader, - InnerLoopPreHeader, DTUpdates); - updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates); + InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false); + // The outer loop header might or might not branch to the outer latch. + // We are guaranteed to branch to the inner loop preheader. + if (std::find(succ_begin(OuterLoopHeaderBI), succ_end(OuterLoopHeaderBI), + OuterLoopLatch) != succ_end(OuterLoopHeaderBI)) + updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates, + /*MustUpdateOnce=*/false); updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader, - InnerLoopHeaderSuccessor, DTUpdates); + InnerLoopHeaderSuccessor, DTUpdates, + /*MustUpdateOnce=*/false); // Adjust reduction PHI's now that the incoming block has changed. InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader, @@ -1520,7 +1539,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() { OuterLoopPreHeader); moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch, - OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock()); + OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(), + InnerLoop, LI); // For PHIs in the exit block of the outer loop, outer's latch has been // replaced by Inners'. OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch); diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index e8dc879a184b4..4e1b4e87ebc95 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -49,6 +49,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -488,7 +489,7 @@ public: // Filter the candidates further. 
SmallVector<StoreToLoadForwardingCandidate, 4> Candidates; unsigned NumForwarding = 0; - for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) { + for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) { LLVM_DEBUG(dbgs() << "Candidate " << Cand); // Make sure that the stored values is available everywhere in the loop in @@ -544,7 +545,8 @@ public: auto *HeaderBB = L->getHeader(); auto *F = HeaderBB->getParent(); bool OptForSize = F->hasOptSize() || - llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI); + llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) { LLVM_DEBUG( dbgs() << "Versioning is needed but not allowed when optimizing " diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp index 885c0e8f4b8b8..1a42f6b23443e 100644 --- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -191,9 +191,12 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/GuardUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -248,7 +251,9 @@ struct LoopICmp { class LoopPredication { AliasAnalysis *AA; + DominatorTree *DT; ScalarEvolution *SE; + LoopInfo *LI; BranchProbabilityInfo *BPI; Loop *L; @@ -300,10 +305,13 @@ class LoopPredication { // within the loop. We identify such unprofitable loops through BPI. bool isLoopProfitableToPredicate(); + bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter); + public: - LoopPredication(AliasAnalysis *AA, ScalarEvolution *SE, + LoopPredication(AliasAnalysis *AA, DominatorTree *DT, + ScalarEvolution *SE, LoopInfo *LI, BranchProbabilityInfo *BPI) - : AA(AA), SE(SE), BPI(BPI){}; + : AA(AA), DT(DT), SE(SE), LI(LI), BPI(BPI) {}; bool runOnLoop(Loop *L); }; @@ -323,10 +331,12 @@ public: if (skipLoop(L)) return false; auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - LoopPredication LP(AA, SE, &BPI); + LoopPredication LP(AA, DT, SE, LI, &BPI); return LP.runOnLoop(L); } }; @@ -352,7 +362,7 @@ PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM, AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); Function *F = L.getHeader()->getParent(); auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F); - LoopPredication LP(&AR.AA, &AR.SE, BPI); + LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, BPI); if (!LP.runOnLoop(&L)) return PreservedAnalyses::all(); @@ -823,9 +833,9 @@ bool LoopPredication::widenWidenableBranchGuardConditions( Value *AllChecks = Builder.CreateAnd(Checks); auto *OldCond = BI->getCondition(); BI->setCondition(AllChecks); + RecursivelyDeleteTriviallyDeadInstructions(OldCond); assert(isGuardAsWidenableBranch(BI) && "Stopped being a guard after transform?"); - RecursivelyDeleteTriviallyDeadInstructions(OldCond); LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n"); return true; @@ -953,6 +963,233 @@ bool LoopPredication::isLoopProfitableToPredicate() { 
return true; } +/// If we can (cheaply) find a widenable branch which controls entry into the +/// loop, return it. +static BranchInst *FindWidenableTerminatorAboveLoop(Loop *L, LoopInfo &LI) { + // Walk back through any unconditional executed blocks and see if we can find + // a widenable condition which seems to control execution of this loop. Note + // that we predict that maythrow calls are likely untaken and thus that it's + // profitable to widen a branch before a maythrow call with a condition + // afterwards even though that may cause the slow path to run in a case where + // it wouldn't have otherwise. + BasicBlock *BB = L->getLoopPreheader(); + if (!BB) + return nullptr; + do { + if (BasicBlock *Pred = BB->getSinglePredecessor()) + if (BB == Pred->getSingleSuccessor()) { + BB = Pred; + continue; + } + break; + } while (true); + + if (BasicBlock *Pred = BB->getSinglePredecessor()) { + auto *Term = Pred->getTerminator(); + + Value *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(Term, Cond, WC, IfTrueBB, IfFalseBB) && + IfTrueBB == BB) + return cast<BranchInst>(Term); + } + return nullptr; +} + +/// Return the minimum of all analyzeable exit counts. This is an upper bound +/// on the actual exit count. If there are not at least two analyzeable exits, +/// returns SCEVCouldNotCompute. +static const SCEV *getMinAnalyzeableBackedgeTakenCount(ScalarEvolution &SE, + DominatorTree &DT, + Loop *L) { + SmallVector<BasicBlock *, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + SmallVector<const SCEV *, 4> ExitCounts; + for (BasicBlock *ExitingBB : ExitingBlocks) { + const SCEV *ExitCount = SE.getExitCount(L, ExitingBB); + if (isa<SCEVCouldNotCompute>(ExitCount)) + continue; + assert(DT.dominates(ExitingBB, L->getLoopLatch()) && + "We should only have known counts for exiting blocks that " + "dominate latch!"); + ExitCounts.push_back(ExitCount); + } + if (ExitCounts.size() < 2) + return SE.getCouldNotCompute(); + return SE.getUMinFromMismatchedTypes(ExitCounts); +} + +/// Return true if we can be fairly sure that executing block BB will probably +/// lead to executing an __llvm_deoptimize. This is a profitability heuristic, +/// not a legality constraint. +static bool isVeryLikelyToDeopt(BasicBlock *BB) { + while (BB->getUniqueSuccessor()) + // Will skip side effects, that's okay + BB = BB->getUniqueSuccessor(); + + return BB->getTerminatingDeoptimizeCall(); +} + +/// This implements an analogous, but entirely distinct transform from the main +/// loop predication transform. This one is phrased in terms of using a +/// widenable branch *outside* the loop to allow us to simplify loop exits in a +/// following loop. This is close in spirit to the IndVarSimplify transform +/// of the same name, but is materially different widening loosens legality +/// sharply. +bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { + // The transformation performed here aims to widen a widenable condition + // above the loop such that all analyzeable exit leading to deopt are dead. + // It assumes that the latch is the dominant exit for profitability and that + // exits branching to deoptimizing blocks are rarely taken. It relies on the + // semantics of widenable expressions for legality. (i.e. 
being able to fall + // down the widenable path spuriously allows us to ignore exit order, + // unanalyzeable exits, side effects, exceptional exits, and other challenges + // which restrict the applicability of the non-WC based version of this + // transform in IndVarSimplify.) + // + // NOTE ON POISON/UNDEF - We're hoisting an expression above guards which may + // imply flags on the expression being hoisted and inserting new uses (flags + // are only correct for current uses). The result is that we may be + // inserting a branch on the value which can be either poison or undef. In + // this case, the branch can legally go either way; we just need to avoid + // introducing UB. This is achieved through the use of the freeze + // instruction. + + SmallVector<BasicBlock *, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + if (ExitingBlocks.empty()) + return false; // Nothing to do. + + auto *Latch = L->getLoopLatch(); + if (!Latch) + return false; + + auto *WidenableBR = FindWidenableTerminatorAboveLoop(L, *LI); + if (!WidenableBR) + return false; + + const SCEV *LatchEC = SE->getExitCount(L, Latch); + if (isa<SCEVCouldNotCompute>(LatchEC)) + return false; // profitability - want hot exit in analyzeable set + + // At this point, we have found an analyzeable latch, and a widenable + // condition above the loop. If we have a widenable exit within the loop + // (for which we can't compute exit counts), drop the ability to further + // widen so that we gain ability to analyze it's exit count and perform this + // transform. TODO: It'd be nice to know for sure the exit became + // analyzeable after dropping widenability. + { + bool Invalidate = false; + + for (auto *ExitingBB : ExitingBlocks) { + if (LI->getLoopFor(ExitingBB) != L) + continue; + + auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + continue; + + Use *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) && + L->contains(IfTrueBB)) { + WC->set(ConstantInt::getTrue(IfTrueBB->getContext())); + Invalidate = true; + } + } + if (Invalidate) + SE->forgetLoop(L); + } + + // The use of umin(all analyzeable exits) instead of latch is subtle, but + // important for profitability. We may have a loop which hasn't been fully + // canonicalized just yet. If the exit we chose to widen is provably never + // taken, we want the widened form to *also* be provably never taken. We + // can't guarantee this as a current unanalyzeable exit may later become + // analyzeable, but we can at least avoid the obvious cases. + const SCEV *MinEC = getMinAnalyzeableBackedgeTakenCount(*SE, *DT, L); + if (isa<SCEVCouldNotCompute>(MinEC) || MinEC->getType()->isPointerTy() || + !SE->isLoopInvariant(MinEC, L) || + !isSafeToExpandAt(MinEC, WidenableBR, *SE)) + return false; + + // Subtlety: We need to avoid inserting additional uses of the WC. We know + // that it can only have one transitive use at the moment, and thus moving + // that use to just before the branch and inserting code before it and then + // modifying the operand is legal. + auto *IP = cast<Instruction>(WidenableBR->getCondition()); + IP->moveBefore(WidenableBR); + Rewriter.setInsertPoint(IP); + IRBuilder<> B(IP); + + bool Changed = false; + Value *MinECV = nullptr; // lazily generated if needed + for (BasicBlock *ExitingBB : ExitingBlocks) { + // If our exiting block exits multiple loops, we can only rewrite the + // innermost one. 
Otherwise, we're changing how many times the innermost + // loop runs before it exits. + if (LI->getLoopFor(ExitingBB) != L) + continue; + + // Can't rewrite non-branch yet. + auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + continue; + + // If already constant, nothing to do. + if (isa<Constant>(BI->getCondition())) + continue; + + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + if (isa<SCEVCouldNotCompute>(ExitCount) || + ExitCount->getType()->isPointerTy() || + !isSafeToExpandAt(ExitCount, WidenableBR, *SE)) + continue; + + const bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); + BasicBlock *ExitBB = BI->getSuccessor(ExitIfTrue ? 0 : 1); + if (!isVeryLikelyToDeopt(ExitBB)) + // Profitability: indicator of rarely/never taken exit + continue; + + // If we found a widenable exit condition, do two things: + // 1) fold the widened exit test into the widenable condition + // 2) fold the branch to untaken - avoids infinite looping + + Value *ECV = Rewriter.expandCodeFor(ExitCount); + if (!MinECV) + MinECV = Rewriter.expandCodeFor(MinEC); + Value *RHS = MinECV; + if (ECV->getType() != RHS->getType()) { + Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType()); + ECV = B.CreateZExt(ECV, WiderTy); + RHS = B.CreateZExt(RHS, WiderTy); + } + assert(!Latch || DT->dominates(ExitingBB, Latch)); + Value *NewCond = B.CreateICmp(ICmpInst::ICMP_UGT, ECV, RHS); + // Freeze poison or undef to an arbitrary bit pattern to ensure we can + // branch without introducing UB. See NOTE ON POISON/UNDEF above for + // context. + NewCond = B.CreateFreeze(NewCond); + + widenWidenableBranch(WidenableBR, NewCond); + + Value *OldCond = BI->getCondition(); + BI->setCondition(ConstantInt::get(OldCond->getType(), !ExitIfTrue)); + Changed = true; + } + + if (Changed) + // We just mutated a bunch of loop exits changing there exit counts + // widely. We need to force recomputation of the exit counts given these + // changes. Note that all of the inserted exits are never taken, and + // should be removed next time the CFG is modified. 
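In source terms, the widenable-branch contract this transform builds on looks roughly like the sketch below; widenable_condition and deoptimize are hypothetical stand-ins for the corresponding intrinsics, not their real signatures:

    // The fast path may only run when the explicit check AND the widenable
    // condition hold. The widenable condition may later be strengthened
    // (widened) with extra checks, because its failure merely diverts to the
    // always-correct slow path; predicateLoopExits exploits this to fold
    // analyzeable, deopt-bound loop exits into one check above the loop.
    extern bool widenable_condition();
    extern int deoptimize(int I, int N);

    int guardedLoad(int *A, int N, int I) {
      bool ExplicitCond = (I >= 0 && I < N);
      if (ExplicitCond && widenable_condition())
        return A[I];           // fast path
      return deoptimize(I, N); // slow path, always safe to take
    }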
+ SE->forgetLoop(L); + return Changed; +} + bool LoopPredication::runOnLoop(Loop *Loop) { L = Loop; @@ -1004,16 +1241,12 @@ bool LoopPredication::runOnLoop(Loop *Loop) { cast<BranchInst>(BB->getTerminator())); } - if (Guards.empty() && GuardsAsWidenableBranches.empty()) - return false; - SCEVExpander Expander(*SE, *DL, "loop-predication"); - bool Changed = false; for (auto *Guard : Guards) Changed |= widenGuardConditions(Guard, Expander); for (auto *Guard : GuardsAsWidenableBranches) Changed |= widenWidenableBranchGuardConditions(Guard, Expander); - + Changed |= predicateLoopExits(L, Expander); return Changed; } diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 96e2c2a3ac6b7..da13a342ae123 100644 --- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -45,6 +44,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -53,6 +53,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <cassert> #include <cstddef> diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp index 94517996df392..0868e742f4eee 100644 --- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -18,6 +18,8 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 299f3fc5fb195..b27e65e0adb73 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -30,6 +30,8 @@ #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils.h" @@ -660,6 +662,9 @@ static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, // Merge Succ into Pred and delete it. 
MergeBlockIntoPredecessor(Succ, &DTU, &LI, MSSAU); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + Changed = true; } diff --git a/llvm/lib/Transforms/Scalar/LoopSink.cpp b/llvm/lib/Transforms/Scalar/LoopSink.cpp index 65e0dee0225ae..1c03a4bf6c021 100644 --- a/llvm/lib/Transforms/Scalar/LoopSink.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSink.cpp @@ -41,14 +41,15 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 7f119175c4a82..e9f368628a087 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -74,7 +74,6 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -97,6 +96,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -108,6 +108,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -115,8 +116,8 @@ #include <cstdlib> #include <iterator> #include <limits> -#include <numeric> #include <map> +#include <numeric> #include <utility> using namespace llvm; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 8d88be4203141..92ad8dafa5abc 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -426,51 +427,76 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, return UnrollResult; } +static bool tryToUnrollAndJamLoop(Function &F, DominatorTree &DT, LoopInfo &LI, + ScalarEvolution &SE, + const TargetTransformInfo &TTI, + AssumptionCache &AC, DependenceInfo &DI, + OptimizationRemarkEmitter &ORE, + int OptLevel) { + bool DidSomething = false; + + // The loop unroll and jam pass requires loops to be in simplified form, and also needs LCSSA. + // Since simplification may add new inner loops, it has to run before the + // legality and profitability checks. This means running the loop unroll and jam pass + // will simplify all loops, regardless of whether anything end up being + // unroll and jammed. 
+ for (auto &L : LI) { + DidSomething |= + simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */); + DidSomething |= formLCSSARecursively(*L, DT, &LI, &SE); + } + + SmallPriorityWorklist<Loop *, 4> Worklist; + internal::appendLoopsToWorklist(reverse(LI), Worklist); + while (!Worklist.empty()) { + Loop *L = Worklist.pop_back_val(); + formLCSSA(*L, DT, &LI, &SE); + LoopUnrollResult Result = + tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel); + if (Result != LoopUnrollResult::Unmodified) + DidSomething = true; + } + + return DidSomething; +} + namespace { -class LoopUnrollAndJam : public LoopPass { +class LoopUnrollAndJam : public FunctionPass { public: static char ID; // Pass ID, replacement for typeid unsigned OptLevel; - LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) { + LoopUnrollAndJam(int OptLevel = 2) : FunctionPass(ID), OptLevel(OptLevel) { initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM) override { - if (skipLoop(L)) + bool runOnFunction(Function &F) override { + if (skipFunction(F)) return false; - Function &F = *L->getHeader()->getParent(); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); - // For the old PM, we can't use OptimizationRemarkEmitter as an analysis - // pass. Function analyses need to be preserved across loop transformations - // but ORE cannot be preserved (see comment before the pass definition). - OptimizationRemarkEmitter ORE(&F); - - LoopUnrollResult Result = - tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); + auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - if (Result == LoopUnrollResult::FullyUnrolled) - LPM.markLoopAsDeleted(*L); - - return Result != LoopUnrollResult::Unmodified; + return tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); } /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... 
void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DependenceAnalysisWrapperPass>(); - getLoopAnalysisUsage(AU); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } }; @@ -480,10 +506,13 @@ char LoopUnrollAndJam::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam", "Unroll and Jam loops", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam", "Unroll and Jam loops", false, false) @@ -491,26 +520,18 @@ Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) { return new LoopUnrollAndJam(OptLevel); } -PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &) { - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); - Function *F = L.getHeader()->getParent(); - - auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F); - // FIXME: This should probably be optional rather than required. - if (!ORE) - report_fatal_error( - "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at " - "a higher level"); - - DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); - - LoopUnrollResult Result = tryToUnrollAndJamLoop( - &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel); - - if (Result == LoopUnrollResult::Unmodified) +PreservedAnalyses LoopUnrollAndJamPass::run(Function &F, + FunctionAnalysisManager &AM) { + ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + LoopInfo &LI = AM.getResult<LoopAnalysis>(F); + TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); + AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); + DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + DependenceInfo &DI = AM.getResult<DependenceAnalysis>(F); + OptimizationRemarkEmitter &ORE = + AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + + if (!tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index a6d4164c36455..4c2b079c6bb5b 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -46,6 +46,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -212,7 +213,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( // Apply size attributes bool OptForSize = L->getHeader()->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI); + llvm::shouldOptimizeForSize(L->getHeader(), 
PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) { UP.Threshold = UP.OptSizeThreshold; UP.PartialThreshold = UP.PartialOptSizeThreshold; diff --git a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index b410df0c5f68c..915e053704b23 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -59,6 +59,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -683,7 +684,7 @@ bool LoopUnswitch::processCurrentLoop() { for (auto &I : *BB) { auto CS = CallSite(&I); if (!CS) continue; - if (CS.hasFnAttr(Attribute::Convergent)) + if (CS.isConvergent()) return false; if (auto *II = dyn_cast<InvokeInst>(&I)) if (!II->getUnwindDest()->canSplitPredecessors()) diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 2ccb7cae3079e..7b9af527d444e 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -79,6 +79,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/llvm/lib/Transforms/Scalar/LowerAtomic.cpp index e076424d90425..ab7b85e89e7bd 100644 --- a/llvm/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Scalar/LowerAtomic.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp index d0fcf38b5a7bd..21c6c32e8e02a 100644 --- a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index d85f20b3f80c1..53671c7bc3d15 100644 --- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp index 9489e01774d64..45f5929e3b903 100644 --- a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" @@ -60,7 +61,7 @@ static bool lowerGuardIntrinsic(Function &F) { 
DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv()); for (auto *CI : ToLower) { - makeGuardControlFlowExplicit(DeoptIntrinsic, CI); + makeGuardControlFlowExplicit(DeoptIntrinsic, CI, false); CI->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp new file mode 100644 index 0000000000000..0ff6ee8bcfcc2 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -0,0 +1,894 @@ +//===- LowerMatrixIntrinsics.cpp - Lower matrix intrinsics -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower matrix intrinsics to vector operations. +// +// TODO: +// * Implement multiply & add fusion +// * Add remark, summarizing the available matrix optimization opportunities. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "lower-matrix-intrinsics" + +static cl::opt<bool> EnableShapePropagation("matrix-propagate-shape", + cl::init(true)); + +static cl::opt<bool> AllowContractEnabled( + "matrix-allow-contract", cl::init(false), cl::Hidden, + cl::desc("Allow the use of FMAs if available and profitable. This may " + "result in different results, due to less rounding error.")); + +namespace { + +// Given an element poitner \p BasePtr to the start of a (sub) matrix, compute +// the start address of column \p Col with type (\p EltType x \p NumRows) +// assuming \p Stride elements between start two consecutive columns. +// \p Stride must be >= \p NumRows. +// +// Consider a 4x4 matrix like below +// +// 0 1 2 3 +// 0 v_0_0 v_0_1 v_0_2 v_0_3 +// 1 v_1_0 v_1_1 v_1_2 v_1_3 +// 2 v_2_0 v_2_1 v_2_2 v_2_3 +// 3 v_3_0 v_3_1 v_3_2 v_3_3 + +// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1, +// we need a pointer to the first element of the submatrix as base pointer. +// Then we can use computeColumnAddr to compute the addresses for the columns +// of the sub-matrix. +// +// Column 0: computeColumnAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..) +// -> just returns Base +// Column 1: computeColumnAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..) +// -> returns Base + (1 * 4) +// Column 2: computeColumnAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..) +// -> returns Base + (2 * 4) +// +// The graphic below illustrates the number of elements in a column (marked +// with |) and the number of skipped elements (marked with }). 
+// +// v_0_0 v_0_1 {v_0_2 {v_0_3 +// Base Col 1 Col 2 +// | | | +// v_1_0 |v_1_1 |v_1_2 |v_1_3 +// v_2_0 |v_2_1 |v_2_2 |v_2_3 +// v_3_0 {v_3_1 {v_3_2 v_3_3 +// +Value *computeColumnAddr(Value *BasePtr, Value *Col, Value *Stride, + unsigned NumRows, Type *EltType, + IRBuilder<> &Builder) { + + assert((!isa<ConstantInt>(Stride) || + cast<ConstantInt>(Stride)->getZExtValue() >= NumRows) && + "Stride must be >= the number of rows."); + unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace(); + + // Compute the start of the column with index Col as Col * Stride. + Value *ColumnStart = Builder.CreateMul(Col, Stride, "col.start"); + + // Get pointer to the start of the selected column. Skip GEP creation, + // if we select column 0. + if (isa<ConstantInt>(ColumnStart) && cast<ConstantInt>(ColumnStart)->isZero()) + ColumnStart = BasePtr; + else + ColumnStart = Builder.CreateGEP(EltType, BasePtr, ColumnStart, "col.gep"); + + // Cast elementwise column start pointer to a pointer to a column + // (EltType x NumRows)*. + Type *ColumnType = VectorType::get(EltType, NumRows); + Type *ColumnPtrType = PointerType::get(ColumnType, AS); + return Builder.CreatePointerCast(ColumnStart, ColumnPtrType, "col.cast"); +} + +/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics. +/// +/// Currently, the lowering for each matrix intrinsic is done as follows: +/// 1. Propagate the shape information from intrinsics to connected +/// instructions. +/// 2. Lower instructions with shape information. +/// 2.1. Get column vectors for each argument. If we already lowered the +/// definition of an argument, use the produced column vectors directly. +/// If not, split the operand vector containing an embedded matrix into +/// a set of column vectors, +/// 2.2. Lower the instruction in terms of columnwise operations, which yields +/// a set of column vectors containing result matrix. Note that we lower +/// all instructions that have shape information. Besides the intrinsics, +/// this includes stores for example. +/// 2.3. Update uses of the lowered instruction. If we have shape information +/// for a user, there is nothing to do, as we will look up the result +/// column matrix when lowering the user. For other uses, we embed the +/// result matrix in a flat vector and update the use. +/// 2.4. Cache the result column matrix for the instruction we lowered +/// 3. After we lowered all instructions in a function, remove the now +/// obsolete instructions. +/// +class LowerMatrixIntrinsics { + Function &Func; + const DataLayout &DL; + const TargetTransformInfo &TTI; + + /// Wrapper class representing a matrix as a set of column vectors. + /// All column vectors must have the same vector type. 
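  /// For example (illustrative, not wording from the patch): a 2 x 3 matrix of
  /// doubles is held as three <2 x double> column vectors.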
+ class ColumnMatrixTy { + SmallVector<Value *, 16> Columns; + + public: + ColumnMatrixTy() : Columns() {} + ColumnMatrixTy(ArrayRef<Value *> Cols) + : Columns(Cols.begin(), Cols.end()) {} + + Value *getColumn(unsigned i) const { return Columns[i]; } + + void setColumn(unsigned i, Value *V) { Columns[i] = V; } + + size_t getNumColumns() const { return Columns.size(); } + size_t getNumRows() const { + assert(Columns.size() > 0 && "Cannot call getNumRows without columns"); + return cast<VectorType>(Columns[0]->getType())->getNumElements(); + } + + const SmallVectorImpl<Value *> &getColumnVectors() const { return Columns; } + + SmallVectorImpl<Value *> &getColumnVectors() { return Columns; } + + void addColumn(Value *V) { Columns.push_back(V); } + + iterator_range<SmallVector<Value *, 8>::iterator> columns() { + return make_range(Columns.begin(), Columns.end()); + } + + /// Embed the columns of the matrix into a flat vector by concatenating + /// them. + Value *embedInVector(IRBuilder<> &Builder) const { + return Columns.size() == 1 ? Columns[0] + : concatenateVectors(Builder, Columns); + } + }; + + struct ShapeInfo { + unsigned NumRows; + unsigned NumColumns; + + ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0) + : NumRows(NumRows), NumColumns(NumColumns) {} + + ShapeInfo(Value *NumRows, Value *NumColumns) + : NumRows(cast<ConstantInt>(NumRows)->getZExtValue()), + NumColumns(cast<ConstantInt>(NumColumns)->getZExtValue()) {} + + bool operator==(const ShapeInfo &other) { + return NumRows == other.NumRows && NumColumns == other.NumColumns; + } + bool operator!=(const ShapeInfo &other) { return !(*this == other); } + + /// Returns true if shape-information is defined, meaning both dimensions + /// are != 0. + operator bool() const { + assert(NumRows == 0 || NumColumns != 0); + return NumRows != 0; + } + }; + + /// Maps instructions to their shape information. The shape information + /// describes the shape to be used while lowering. This matches the shape of + /// the result value of the instruction, with the only exceptions being store + /// instructions and the matrix_columnwise_store intrinsics. For those, the + /// shape information indicates that those instructions should be lowered + /// using shape information as well. + DenseMap<Value *, ShapeInfo> ShapeMap; + + /// List of instructions to remove. While lowering, we are not replacing all + /// users of a lowered instruction, if shape information is available and + /// those need to be removed after we finished lowering. + SmallVector<Instruction *, 16> ToRemove; + + /// Map from instructions to their produced column matrix. + DenseMap<Value *, ColumnMatrixTy> Inst2ColumnMatrix; + +public: + LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI) + : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI) {} + + /// Return the set of column vectors that a matrix value is lowered to. + /// + /// If we lowered \p MatrixVal, just return the cache result column matrix. + /// Otherwie split the flat vector \p MatrixVal containing a matrix with + /// shape \p SI into column vectors. + ColumnMatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI, + IRBuilder<> Builder) { + VectorType *VType = dyn_cast<VectorType>(MatrixVal->getType()); + assert(VType && "MatrixVal must be a vector type"); + assert(VType->getNumElements() == SI.NumRows * SI.NumColumns && + "The vector size must match the number of matrix elements"); + + // Check if we lowered MatrixVal using shape information. 
In that case, + // return the existing column matrix, if it matches the requested shape + // information. If there is a mis-match, embed the result in a flat + // vector and split it later. + auto Found = Inst2ColumnMatrix.find(MatrixVal); + if (Found != Inst2ColumnMatrix.end()) { + ColumnMatrixTy &M = Found->second; + // Return the found matrix, if its shape matches the requested shape + // information + if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns()) + return M; + + MatrixVal = M.embedInVector(Builder); + } + + // Otherwise split MatrixVal. + SmallVector<Value *, 16> SplitVecs; + Value *Undef = UndefValue::get(VType); + for (unsigned MaskStart = 0; MaskStart < VType->getNumElements(); + MaskStart += SI.NumRows) { + Constant *Mask = createSequentialMask(Builder, MaskStart, SI.NumRows, 0); + Value *V = Builder.CreateShuffleVector(MatrixVal, Undef, Mask, "split"); + SplitVecs.push_back(V); + } + + return {SplitVecs}; + } + + /// If \p V already has a known shape return false. Otherwise set the shape + /// for instructions that support it. + bool setShapeInfo(Value *V, ShapeInfo Shape) { + assert(Shape && "Shape not set"); + if (isa<UndefValue>(V) || !supportsShapeInfo(V)) + return false; + + auto SIter = ShapeMap.find(V); + if (SIter != ShapeMap.end()) { + LLVM_DEBUG(dbgs() << " not overriding existing shape: " + << SIter->second.NumRows << " " + << SIter->second.NumColumns << " for " << *V << "\n"); + return false; + } + + ShapeMap.insert({V, Shape}); + LLVM_DEBUG(dbgs() << " " << Shape.NumRows << " x " << Shape.NumColumns + << " for " << *V << "\n"); + return true; + } + + bool isUniformShape(Value *V) { + Instruction *I = dyn_cast<Instruction>(V); + if (!I) + return true; + + switch (I->getOpcode()) { + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: // Scalar multiply. + case Instruction::Add: + case Instruction::Mul: + case Instruction::Sub: + return true; + default: + return false; + } + } + + /// Returns true if shape information can be used for \p V. The supported + /// instructions must match the instructions that can be lowered by this pass. + bool supportsShapeInfo(Value *V) { + Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst) + return false; + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); + if (II) + switch (II->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + case Intrinsic::matrix_transpose: + case Intrinsic::matrix_columnwise_load: + case Intrinsic::matrix_columnwise_store: + return true; + default: + return false; + } + return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V); + } + + /// Propagate the shape information of instructions to their users. + /// The work list contains instructions for which we can compute the shape, + /// either based on the information provided by matrix intrinsics or known + /// shapes of operands. + SmallVector<Instruction *, 32> + propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) { + SmallVector<Instruction *, 32> NewWorkList; + // Pop an element for which we guaranteed to have at least one of the + // operand shapes. Add the shape for this and then add users to the work + // list. 
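    // Illustrative example (hypothetical IR, for orientation only): for
    //   %c = call <4 x double> @llvm.matrix.multiply(%a, %b, i32 2, i32 2, i32 2)
    // popping the call records a 2 x 2 shape for %c and pushes %c's users,
    // so a later fadd of %c picks up the same 2 x 2 shape on a following pop.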
+ LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n"); + while (!WorkList.empty()) { + Instruction *Inst = WorkList.back(); + WorkList.pop_back(); + + // New entry, set the value and insert operands + bool Propagate = false; + + Value *MatrixA; + Value *MatrixB; + Value *M; + Value *N; + Value *K; + if (match(Inst, m_Intrinsic<Intrinsic::matrix_multiply>( + m_Value(MatrixA), m_Value(MatrixB), m_Value(M), + m_Value(N), m_Value(K)))) { + Propagate = setShapeInfo(Inst, {M, K}); + } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_transpose>( + m_Value(MatrixA), m_Value(M), m_Value(N)))) { + // Flip dimensions. + Propagate = setShapeInfo(Inst, {N, M}); + } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_columnwise_store>( + m_Value(MatrixA), m_Value(), m_Value(), + m_Value(M), m_Value(N)))) { + Propagate = setShapeInfo(Inst, {N, M}); + } else if (match(Inst, + m_Intrinsic<Intrinsic::matrix_columnwise_load>( + m_Value(), m_Value(), m_Value(M), m_Value(N)))) { + Propagate = setShapeInfo(Inst, {M, N}); + } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) { + auto OpShape = ShapeMap.find(MatrixA); + if (OpShape != ShapeMap.end()) + setShapeInfo(Inst, OpShape->second); + continue; + } else if (isUniformShape(Inst)) { + // Find the first operand that has a known shape and use that. + for (auto &Op : Inst->operands()) { + auto OpShape = ShapeMap.find(Op.get()); + if (OpShape != ShapeMap.end()) { + Propagate |= setShapeInfo(Inst, OpShape->second); + break; + } + } + } + + if (Propagate) { + NewWorkList.push_back(Inst); + for (auto *User : Inst->users()) + if (ShapeMap.count(User) == 0) + WorkList.push_back(cast<Instruction>(User)); + } + } + + return NewWorkList; + } + + /// Propagate the shape to operands of instructions with shape information. + /// \p Worklist contains the instruction for which we already know the shape. + SmallVector<Instruction *, 32> + propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) { + SmallVector<Instruction *, 32> NewWorkList; + + auto pushInstruction = [](Value *V, + SmallVectorImpl<Instruction *> &WorkList) { + Instruction *I = dyn_cast<Instruction>(V); + if (I) + WorkList.push_back(I); + }; + // Pop an element with known shape. Traverse the operands, if their shape + // derives from the result shape and is unknown, add it and add them to the + // worklist. + LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n"); + while (!WorkList.empty()) { + Value *V = WorkList.back(); + WorkList.pop_back(); + + size_t BeforeProcessingV = WorkList.size(); + if (!isa<Instruction>(V)) + continue; + + Value *MatrixA; + Value *MatrixB; + Value *M; + Value *N; + Value *K; + if (match(V, m_Intrinsic<Intrinsic::matrix_multiply>( + m_Value(MatrixA), m_Value(MatrixB), m_Value(M), + m_Value(N), m_Value(K)))) { + if (setShapeInfo(MatrixA, {M, N})) + pushInstruction(MatrixA, WorkList); + + if (setShapeInfo(MatrixB, {N, K})) + pushInstruction(MatrixB, WorkList); + + } else if (match(V, m_Intrinsic<Intrinsic::matrix_transpose>( + m_Value(MatrixA), m_Value(M), m_Value(N)))) { + // Flip dimensions. + if (setShapeInfo(MatrixA, {M, N})) + pushInstruction(MatrixA, WorkList); + } else if (match(V, m_Intrinsic<Intrinsic::matrix_columnwise_store>( + m_Value(MatrixA), m_Value(), m_Value(), + m_Value(M), m_Value(N)))) { + if (setShapeInfo(MatrixA, {M, N})) { + pushInstruction(MatrixA, WorkList); + } + } else if (isa<LoadInst>(V) || + match(V, m_Intrinsic<Intrinsic::matrix_columnwise_load>())) { + // Nothing to do, no matrix input. 
+ } else if (isa<StoreInst>(V)) { + // Nothing to do. We forward-propagated to this so we would just + // backward propagate to an instruction with an already known shape. + } else if (isUniformShape(V)) { + // Propagate to all operands. + ShapeInfo Shape = ShapeMap[V]; + for (Use &U : cast<Instruction>(V)->operands()) { + if (setShapeInfo(U.get(), Shape)) + pushInstruction(U.get(), WorkList); + } + } + // After we discovered new shape info for new instructions in the + // worklist, we use their users as seeds for the next round of forward + // propagation. + for (size_t I = BeforeProcessingV; I != WorkList.size(); I++) + for (User *U : WorkList[I]->users()) + if (isa<Instruction>(U) && V != U) + NewWorkList.push_back(cast<Instruction>(U)); + } + return NewWorkList; + } + + bool Visit() { + if (EnableShapePropagation) { + SmallVector<Instruction *, 32> WorkList; + + // Initially only the shape of matrix intrinsics is known. + // Initialize the work list with ops carrying shape information. + for (BasicBlock &BB : Func) + for (Instruction &Inst : BB) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst); + if (!II) + continue; + + switch (II->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + case Intrinsic::matrix_transpose: + case Intrinsic::matrix_columnwise_load: + case Intrinsic::matrix_columnwise_store: + WorkList.push_back(&Inst); + break; + default: + break; + } + } + // Propagate shapes until nothing changes any longer. + while (!WorkList.empty()) { + WorkList = propagateShapeForward(WorkList); + WorkList = propagateShapeBackward(WorkList); + } + } + + ReversePostOrderTraversal<Function *> RPOT(&Func); + bool Changed = false; + for (auto *BB : RPOT) { + for (Instruction &Inst : make_early_inc_range(*BB)) { + IRBuilder<> Builder(&Inst); + + if (CallInst *CInst = dyn_cast<CallInst>(&Inst)) + Changed |= VisitCallInst(CInst); + + Value *Op1; + Value *Op2; + if (auto *BinOp = dyn_cast<BinaryOperator>(&Inst)) + Changed |= VisitBinaryOperator(BinOp); + if (match(&Inst, m_Load(m_Value(Op1)))) + Changed |= VisitLoad(&Inst, Op1, Builder); + else if (match(&Inst, m_Store(m_Value(Op1), m_Value(Op2)))) + Changed |= VisitStore(&Inst, Op1, Op2, Builder); + } + } + + for (Instruction *Inst : reverse(ToRemove)) + Inst->eraseFromParent(); + + return Changed; + } + + LoadInst *createColumnLoad(Value *ColumnPtr, Type *EltType, + IRBuilder<> Builder) { + unsigned Align = DL.getABITypeAlignment(EltType); + return Builder.CreateAlignedLoad(ColumnPtr, Align, "col.load"); + } + + StoreInst *createColumnStore(Value *ColumnValue, Value *ColumnPtr, + Type *EltType, IRBuilder<> Builder) { + unsigned Align = DL.getABITypeAlignment(EltType); + return Builder.CreateAlignedStore(ColumnValue, ColumnPtr, Align); + } + + + /// Turns \p BasePtr into an elementwise pointer to \p EltType. 
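  /// For example (illustrative): an <8 x double>* base pointer is cast to
  /// double*, preserving the original address space.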
+ Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) { + unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace(); + Type *EltPtrType = PointerType::get(EltType, AS); + return Builder.CreatePointerCast(BasePtr, EltPtrType); + } + + /// Replace intrinsic calls + bool VisitCallInst(CallInst *Inst) { + if (!Inst->getCalledFunction() || !Inst->getCalledFunction()->isIntrinsic()) + return false; + + switch (Inst->getCalledFunction()->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + LowerMultiply(Inst); + break; + case Intrinsic::matrix_transpose: + LowerTranspose(Inst); + break; + case Intrinsic::matrix_columnwise_load: + LowerColumnwiseLoad(Inst); + break; + case Intrinsic::matrix_columnwise_store: + LowerColumnwiseStore(Inst); + break; + default: + return false; + } + return true; + } + + void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride, + ShapeInfo Shape) { + IRBuilder<> Builder(Inst); + auto VType = cast<VectorType>(Inst->getType()); + Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); + ColumnMatrixTy Result; + // Distance between start of one column and the start of the next + for (unsigned C = 0, E = Shape.NumColumns; C < E; ++C) { + Value *GEP = + computeColumnAddr(EltPtr, Builder.getInt32(C), Stride, Shape.NumRows, + VType->getElementType(), Builder); + Value *Column = createColumnLoad(GEP, VType->getElementType(), Builder); + Result.addColumn(Column); + } + + finalizeLowering(Inst, Result, Builder); + } + + /// Lowers llvm.matrix.columnwise.load. + /// + /// The intrinsic loads a matrix from memory using a stride between columns. + void LowerColumnwiseLoad(CallInst *Inst) { + Value *Ptr = Inst->getArgOperand(0); + Value *Stride = Inst->getArgOperand(1); + LowerLoad(Inst, Ptr, Stride, + {Inst->getArgOperand(2), Inst->getArgOperand(3)}); + } + + void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, Value *Stride, + ShapeInfo Shape) { + IRBuilder<> Builder(Inst); + auto VType = cast<VectorType>(Matrix->getType()); + Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); + auto LM = getMatrix(Matrix, Shape, Builder); + for (auto C : enumerate(LM.columns())) { + Value *GEP = + computeColumnAddr(EltPtr, Builder.getInt32(C.index()), Stride, + Shape.NumRows, VType->getElementType(), Builder); + createColumnStore(C.value(), GEP, VType->getElementType(), Builder); + } + + ToRemove.push_back(Inst); + } + + /// Lowers llvm.matrix.columnwise.store. + /// + /// The intrinsic store a matrix back memory using a stride between columns. + void LowerColumnwiseStore(CallInst *Inst) { + Value *Matrix = Inst->getArgOperand(0); + Value *Ptr = Inst->getArgOperand(1); + Value *Stride = Inst->getArgOperand(2); + LowerStore(Inst, Matrix, Ptr, Stride, + {Inst->getArgOperand(3), Inst->getArgOperand(4)}); + } + + /// Extract a column vector of \p NumElts starting at index (\p I, \p J) from + /// the matrix \p LM represented as a vector of column vectors. 
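  /// For example (illustrative): extracting NumElts = 2 at (I = 1, J = 0)
  /// shuffles out elements 1 and 2 of column 0.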
+ Value *extractVector(const ColumnMatrixTy &LM, unsigned I, unsigned J, + unsigned NumElts, IRBuilder<> Builder) { + Value *Col = LM.getColumn(J); + Value *Undef = UndefValue::get(Col->getType()); + Constant *Mask = createSequentialMask(Builder, I, NumElts, 0); + return Builder.CreateShuffleVector(Col, Undef, Mask, "block"); + } + + // Set elements I..I+NumElts-1 to Block + Value *insertVector(Value *Col, unsigned I, Value *Block, + IRBuilder<> Builder) { + + // First, bring Block to the same size as Col + unsigned BlockNumElts = + cast<VectorType>(Block->getType())->getNumElements(); + unsigned NumElts = cast<VectorType>(Col->getType())->getNumElements(); + assert(NumElts >= BlockNumElts && "Too few elements for current block"); + + Value *ExtendMask = + createSequentialMask(Builder, 0, BlockNumElts, NumElts - BlockNumElts); + Value *Undef = UndefValue::get(Block->getType()); + Block = Builder.CreateShuffleVector(Block, Undef, ExtendMask); + + // If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7, + // 8, 4, 5, 6 + SmallVector<Constant *, 16> Mask; + unsigned i; + for (i = 0; i < I; i++) + Mask.push_back(Builder.getInt32(i)); + + unsigned VecNumElts = cast<VectorType>(Col->getType())->getNumElements(); + for (; i < I + BlockNumElts; i++) + Mask.push_back(Builder.getInt32(i - I + VecNumElts)); + + for (; i < VecNumElts; i++) + Mask.push_back(Builder.getInt32(i)); + + Value *MaskVal = ConstantVector::get(Mask); + + return Builder.CreateShuffleVector(Col, Block, MaskVal); + } + + Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp, + IRBuilder<> &Builder, bool AllowContraction) { + + if (!Sum) + return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B); + + if (UseFPOp) { + if (AllowContraction) { + // Use fmuladd for floating point operations and let the backend decide + // if that's profitable. + Value *FMulAdd = Intrinsic::getDeclaration( + Func.getParent(), Intrinsic::fmuladd, A->getType()); + return Builder.CreateCall(FMulAdd, {A, B, Sum}); + } + Value *Mul = Builder.CreateFMul(A, B); + return Builder.CreateFAdd(Sum, Mul); + } + + Value *Mul = Builder.CreateMul(A, B); + return Builder.CreateAdd(Sum, Mul); + } + + /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For + /// users with shape information, there's nothing to do: the will use the + /// cached value when they are lowered. For other users, \p Matrix is + /// flattened and the uses are updated to use it. Also marks \p Inst for + /// deletion. + void finalizeLowering(Instruction *Inst, ColumnMatrixTy Matrix, + IRBuilder<> &Builder) { + Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix)); + + ToRemove.push_back(Inst); + Value *Flattened = nullptr; + for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) { + Use &U = *I++; + if (ShapeMap.find(U.getUser()) == ShapeMap.end()) { + if (!Flattened) + Flattened = Matrix.embedInVector(Builder); + U.set(Flattened); + } + } + } + + /// Lowers llvm.matrix.multiply. 
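  /// Illustrative shape bookkeeping: for a 4 x 3 times 3 x 2 multiply, LShape
  /// is (4, 3), RShape is (3, 2), and the result is assembled as two columns
  /// of 4 elements, accumulated blockwise along the inner dimension.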
+ void LowerMultiply(CallInst *MatMul) { + IRBuilder<> Builder(MatMul); + auto *EltType = cast<VectorType>(MatMul->getType())->getElementType(); + ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3)); + ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4)); + + const ColumnMatrixTy &Lhs = + getMatrix(MatMul->getArgOperand(0), LShape, Builder); + const ColumnMatrixTy &Rhs = + getMatrix(MatMul->getArgOperand(1), RShape, Builder); + + const unsigned R = LShape.NumRows; + const unsigned M = LShape.NumColumns; + const unsigned C = RShape.NumColumns; + assert(M == RShape.NumRows); + + // Initialize the output + ColumnMatrixTy Result; + for (unsigned J = 0; J < C; ++J) + Result.addColumn(UndefValue::get(VectorType::get(EltType, R))); + + const unsigned VF = std::max(TTI.getRegisterBitWidth(true) / + EltType->getPrimitiveSizeInBits(), + uint64_t(1)); + + bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) && + MatMul->hasAllowContract()); + // Multiply columns from the first operand with scalars from the second + // operand. Then move along the K axes and accumulate the columns. With + // this the adds can be vectorized without reassociation. + for (unsigned J = 0; J < C; ++J) { + unsigned BlockSize = VF; + for (unsigned I = 0; I < R; I += BlockSize) { + // Gradually lower the vectorization factor to cover the remainder. + while (I + BlockSize > R) + BlockSize /= 2; + + Value *Sum = nullptr; + for (unsigned K = 0; K < M; ++K) { + Value *L = extractVector(Lhs, I, K, BlockSize, Builder); + Value *RH = Builder.CreateExtractElement(Rhs.getColumn(J), K); + Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat"); + Sum = createMulAdd(Sum, L, Splat, EltType->isFloatingPointTy(), + Builder, AllowContract); + } + Result.setColumn(J, insertVector(Result.getColumn(J), I, Sum, Builder)); + } + } + finalizeLowering(MatMul, Result, Builder); + } + + /// Lowers llvm.matrix.transpose. + void LowerTranspose(CallInst *Inst) { + ColumnMatrixTy Result; + IRBuilder<> Builder(Inst); + Value *InputVal = Inst->getArgOperand(0); + VectorType *VectorTy = cast<VectorType>(InputVal->getType()); + ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2)); + ColumnMatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder); + + for (unsigned Row = 0; Row < ArgShape.NumRows; ++Row) { + // Build a single column vector for this row. First initialize it. + Value *ResultColumn = UndefValue::get( + VectorType::get(VectorTy->getElementType(), ArgShape.NumColumns)); + + // Go through the elements of this row and insert it into the resulting + // column vector. + for (auto C : enumerate(InputMatrix.columns())) { + Value *Elt = Builder.CreateExtractElement(C.value(), Row); + // We insert at index Column since that is the row index after the + // transpose. + ResultColumn = + Builder.CreateInsertElement(ResultColumn, Elt, C.index()); + } + Result.addColumn(ResultColumn); + } + + finalizeLowering(Inst, Result, Builder); + } + + /// Lower load instructions, if shape information is available. 
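  /// For example (illustrative): a load of <6 x float> with a known 2 x 3
  /// shape becomes three <2 x float> column loads using a stride equal to the
  /// number of rows (2).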
+ bool VisitLoad(Instruction *Inst, Value *Ptr, IRBuilder<> &Builder) { + auto I = ShapeMap.find(Inst); + if (I == ShapeMap.end()) + return false; + + LowerLoad(Inst, Ptr, Builder.getInt32(I->second.NumRows), I->second); + return true; + } + + bool VisitStore(Instruction *Inst, Value *StoredVal, Value *Ptr, + IRBuilder<> &Builder) { + auto I = ShapeMap.find(StoredVal); + if (I == ShapeMap.end()) + return false; + + LowerStore(Inst, StoredVal, Ptr, Builder.getInt32(I->second.NumRows), I->second); + return true; + } + + /// Lower binary operators, if shape information is available. + bool VisitBinaryOperator(BinaryOperator *Inst) { + auto I = ShapeMap.find(Inst); + if (I == ShapeMap.end()) + return false; + + Value *Lhs = Inst->getOperand(0); + Value *Rhs = Inst->getOperand(1); + + IRBuilder<> Builder(Inst); + ShapeInfo &Shape = I->second; + + ColumnMatrixTy LoweredLhs = getMatrix(Lhs, Shape, Builder); + ColumnMatrixTy LoweredRhs = getMatrix(Rhs, Shape, Builder); + + // Add each column and store the result back into the opmapping + ColumnMatrixTy Result; + auto BuildColumnOp = [&Builder, Inst](Value *LHS, Value *RHS) { + switch (Inst->getOpcode()) { + case Instruction::Add: + return Builder.CreateAdd(LHS, RHS); + case Instruction::Mul: + return Builder.CreateMul(LHS, RHS); + case Instruction::Sub: + return Builder.CreateSub(LHS, RHS); + case Instruction::FAdd: + return Builder.CreateFAdd(LHS, RHS); + case Instruction::FMul: + return Builder.CreateFMul(LHS, RHS); + case Instruction::FSub: + return Builder.CreateFSub(LHS, RHS); + default: + llvm_unreachable("Unsupported binary operator for matrix"); + } + }; + for (unsigned C = 0; C < Shape.NumColumns; ++C) + Result.addColumn( + BuildColumnOp(LoweredLhs.getColumn(C), LoweredRhs.getColumn(C))); + + finalizeLowering(Inst, Result, Builder); + return true; + } +}; +} // namespace + +PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &TTI = AM.getResult<TargetIRAnalysis>(F); + LowerMatrixIntrinsics LMT(F, TTI); + if (LMT.Visit()) { + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; + } + return PreservedAnalyses::all(); +} + +namespace { + +class LowerMatrixIntrinsicsLegacyPass : public FunctionPass { +public: + static char ID; + + LowerMatrixIntrinsicsLegacyPass() : FunctionPass(ID) { + initializeLowerMatrixIntrinsicsLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + LowerMatrixIntrinsics LMT(F, *TTI); + bool C = LMT.Visit(); + return C; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); + } +}; +} // namespace + +static const char pass_name[] = "Lower the matrix intrinsics"; +char LowerMatrixIntrinsicsLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, + false, false) +INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, + false, false) + +Pass *llvm::createLowerMatrixIntrinsicsPass() { + return new LowerMatrixIntrinsicsLegacyPass(); +} diff --git a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp index 5342f2ddcb6b1..73b2cd06fa230 100644 --- a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp +++ b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Intrinsics.h" #include 
"llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" diff --git a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp index 789232e0f5ce7..5ffae128f5f0f 100644 --- a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp +++ b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -33,10 +33,11 @@ #include "llvm/Transforms/Scalar/MakeGuardsExplicit.h" #include "llvm/Analysis/GuardUtils.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" @@ -56,23 +57,11 @@ struct MakeGuardsExplicitLegacyPass : public FunctionPass { static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) { // Replace the guard with an explicit branch (just like in GuardWidening). - BasicBlock *BB = Guard->getParent(); - makeGuardControlFlowExplicit(DeoptIntrinsic, Guard); - BranchInst *ExplicitGuard = cast<BranchInst>(BB->getTerminator()); - assert(ExplicitGuard->isConditional() && "Must be!"); + BasicBlock *OriginalBB = Guard->getParent(); + (void)OriginalBB; + makeGuardControlFlowExplicit(DeoptIntrinsic, Guard, true); + assert(isWidenableBranch(OriginalBB->getTerminator()) && "should hold"); - // We want the guard to be expressed as explicit control flow, but still be - // widenable. For that, we add Widenable Condition intrinsic call to the - // guard's condition. - IRBuilder<> B(ExplicitGuard); - auto *WidenableCondition = - B.CreateIntrinsic(Intrinsic::experimental_widenable_condition, - {}, {}, nullptr, "widenable_cond"); - WidenableCondition->setCallingConv(Guard->getCallingConv()); - auto *NewCond = - B.CreateAnd(ExplicitGuard->getCondition(), WidenableCondition); - NewCond->setName("exiplicit_guard_cond"); - ExplicitGuard->setCondition(NewCond); Guard->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 2364748efb057..c24fa40860eb3 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -24,7 +24,6 @@ #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -49,12 +48,14 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -387,16 +388,12 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, StartPtr = Range.StartPtr; // Determine alignment - unsigned Alignment = Range.Alignment; - if (Alignment == 0) { - Type *EltType = - cast<PointerType>(StartPtr->getType())->getElementType(); - Alignment = DL.getABITypeAlignment(EltType); - } - - AMemSet = - Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, 
Alignment); + const Align Alignment = DL.getValueOrABITypeAlignment( + MaybeAlign(Range.Alignment), + cast<PointerType>(StartPtr->getType())->getElementType()); + AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start, + Alignment); LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI : Range.TheStores) dbgs() << *SI << '\n'; @@ -416,25 +413,21 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, return AMemSet; } -static unsigned findStoreAlignment(const DataLayout &DL, const StoreInst *SI) { - unsigned StoreAlign = SI->getAlignment(); - if (!StoreAlign) - StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType()); - return StoreAlign; +static Align findStoreAlignment(const DataLayout &DL, const StoreInst *SI) { + return DL.getValueOrABITypeAlignment(MaybeAlign(SI->getAlignment()), + SI->getOperand(0)->getType()); } -static unsigned findLoadAlignment(const DataLayout &DL, const LoadInst *LI) { - unsigned LoadAlign = LI->getAlignment(); - if (!LoadAlign) - LoadAlign = DL.getABITypeAlignment(LI->getType()); - return LoadAlign; +static Align findLoadAlignment(const DataLayout &DL, const LoadInst *LI) { + return DL.getValueOrABITypeAlignment(MaybeAlign(LI->getAlignment()), + LI->getType()); } -static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI, - const LoadInst *LI) { - unsigned StoreAlign = findStoreAlignment(DL, SI); - unsigned LoadAlign = findLoadAlignment(DL, LI); - return MinAlign(StoreAlign, LoadAlign); +static Align findCommonAlignment(const DataLayout &DL, const StoreInst *SI, + const LoadInst *LI) { + Align StoreAlign = findStoreAlignment(DL, SI); + Align LoadAlign = findLoadAlignment(DL, LI); + return commonAlignment(StoreAlign, LoadAlign); } // This method try to lift a store instruction before position P. @@ -649,7 +642,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), DL.getTypeStoreSize(SI->getOperand(0)->getType()), - findCommonAlignment(DL, SI, LI), C); + findCommonAlignment(DL, SI, LI).value(), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); @@ -682,12 +675,11 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { auto *T = V->getType(); if (T->isAggregateType()) { uint64_t Size = DL.getTypeStoreSize(T); - unsigned Align = SI->getAlignment(); - if (!Align) - Align = DL.getABITypeAlignment(T); + const Align MA = + DL.getValueOrABITypeAlignment(MaybeAlign(SI->getAlignment()), T); IRBuilder<> Builder(SI); auto *M = - Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size, Align); + Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size, MA); LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n"); @@ -982,12 +974,12 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // example we could be moving from movaps -> movq on x86. 
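  // Illustrative example (hypothetical IR): given
  //   memcpy(%b, %a, 64)  ...  memcpy(%c, %b, 64)
  // the second copy is rewritten to copy from %a directly, now passing the
  // MaybeAlign-typed getDestAlign()/getSourceAlign() values through.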
IRBuilder<> Builder(M); if (UseMemMove) - Builder.CreateMemMove(M->getRawDest(), M->getDestAlignment(), - MDep->getRawSource(), MDep->getSourceAlignment(), + Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(), + MDep->getRawSource(), MDep->getSourceAlign(), M->getLength(), M->isVolatile()); else - Builder.CreateMemCpy(M->getRawDest(), M->getDestAlignment(), - MDep->getRawSource(), MDep->getSourceAlignment(), + Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(), + MDep->getRawSource(), MDep->getSourceAlign(), M->getLength(), M->isVolatile()); // Remove the instruction we're replacing. @@ -1057,7 +1049,7 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, Builder.CreateMemSet( Builder.CreateGEP(Dest->getType()->getPointerElementType(), Dest, SrcSize), - MemSet->getOperand(1), MemsetLen, Align); + MemSet->getOperand(1), MemsetLen, MaybeAlign(Align)); MD->removeInstruction(MemSet); MemSet->eraseFromParent(); @@ -1125,8 +1117,8 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, } IRBuilder<> Builder(MemCpy); - Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1), - CopySize, MemCpy->getDestAlignment()); + Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1), CopySize, + MaybeAlign(MemCpy->getDestAlignment())); return true; } @@ -1153,7 +1145,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) { M->getModule()->getDataLayout())) { IRBuilder<> Builder(M); Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(), - M->getDestAlignment(), false); + MaybeAlign(M->getDestAlignment()), false); MD->removeInstruction(M); M->eraseFromParent(); ++NumCpyToSet; diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp index 98a45b391319e..ce1e142101b82 100644 --- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -50,6 +50,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" diff --git a/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 9799ea7960ec5..6b0d0202d9bb9 100644 --- a/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -83,6 +83,7 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index 1260bd39cdee3..bba9082e31b2f 100644 --- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -82,7 +82,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -101,10 +100,12 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" +#include 
"llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index b213264de557e..6a643480f3128 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -76,7 +76,6 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -94,6 +93,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ArrayRecycler.h" @@ -106,6 +106,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVNExpression.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PredicateInfo.h" #include "llvm/Transforms/Utils/VNCoercion.h" #include <algorithm> @@ -489,11 +490,11 @@ namespace { class NewGVN { Function &F; - DominatorTree *DT; - const TargetLibraryInfo *TLI; - AliasAnalysis *AA; - MemorySSA *MSSA; - MemorySSAWalker *MSSAWalker; + DominatorTree *DT = nullptr; + const TargetLibraryInfo *TLI = nullptr; + AliasAnalysis *AA = nullptr; + MemorySSA *MSSA = nullptr; + MemorySSAWalker *MSSAWalker = nullptr; const DataLayout &DL; std::unique_ptr<PredicateInfo> PredInfo; @@ -505,7 +506,7 @@ class NewGVN { const SimplifyQuery SQ; // Number of function arguments, used by ranking - unsigned int NumFuncArgs; + unsigned int NumFuncArgs = 0; // RPOOrdering of basic blocks DenseMap<const DomTreeNode *, unsigned> RPOOrdering; @@ -516,9 +517,9 @@ class NewGVN { // startsout in, and represents any value. Being an optimistic analysis, // anything in the TOP class has the value TOP, which is indeterminate and // equivalent to everything. - CongruenceClass *TOPClass; + CongruenceClass *TOPClass = nullptr; std::vector<CongruenceClass *> CongruenceClasses; - unsigned NextCongruenceNum; + unsigned NextCongruenceNum = 0; // Value Mappings. DenseMap<Value *, CongruenceClass *> ValueToClass; @@ -862,7 +863,7 @@ private: // Debug counter info. When verifying, we have to reset the value numbering // debug counter to the same state it started in to get the same results. 
- int64_t StartingVNCounter; + int64_t StartingVNCounter = 0; }; } // end anonymous namespace diff --git a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 68a0f5151ad50..58763ec72eced 100644 --- a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" diff --git a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index beb299272ed8d..5c4a89977c388 100644 --- a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -47,6 +47,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ADT/SetVector.h" diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index 124f625ef7b66..41940e980faa9 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -30,7 +30,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -50,12 +49,14 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <utility> @@ -173,7 +174,7 @@ void ReassociatePass::BuildRankMap(Function &F, << "\n"); } - // Traverse basic blocks in ReversePostOrder + // Traverse basic blocks in ReversePostOrder. for (BasicBlock *BB : RPOT) { unsigned BBRank = RankMap[BB] = ++Rank << 16; @@ -1898,6 +1899,7 @@ void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I, ValueRankMap.erase(I); Insts.remove(I); RedoInsts.remove(I); + llvm::salvageDebugInfoOrMarkUndef(*I); I->eraseFromParent(); for (auto Op : Ops) if (Instruction *OpInst = dyn_cast<Instruction>(Op)) @@ -1914,6 +1916,7 @@ void ReassociatePass::EraseInst(Instruction *I) { // Erase the dead instruction. ValueRankMap.erase(I); RedoInsts.remove(I); + llvm::salvageDebugInfoOrMarkUndef(*I); I->eraseFromParent(); // Optimize its operands. SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes. 
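The two Reassociate hunks above insert llvm::salvageDebugInfoOrMarkUndef(*I) immediately before the dead instruction is erased, so debug-value users are either rewritten in terms of the operands or set to undef rather than left dangling. A minimal sketch of that pattern, assuming only the helper declared in llvm/Transforms/Utils/Local.h; the wrapper function name is illustrative and not from the patch:

  // Sketch: drop a trivially dead instruction without losing or corrupting
  // the debug values that still refer to it.
  #include "llvm/IR/Instruction.h"
  #include "llvm/Transforms/Utils/Local.h"

  static void eraseDeadInstruction(llvm::Instruction &I) {
    // Try to rewrite dbg.value users in terms of I's operands; if that is
    // not possible, mark them undef instead of leaving a stale reference.
    llvm::salvageDebugInfoOrMarkUndef(I);
    I.dropAllReferences();
    I.eraseFromParent();
  }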
diff --git a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 3296322e00d5f..0716c13209820 100644 --- a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -16,16 +16,17 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 48bbdd8d1b33f..b242f100fafff 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -54,6 +54,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 10fbdc8aacd2f..e696ea83a300e 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -29,7 +29,6 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/IR/BasicBlock.h" @@ -49,12 +48,14 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PredicateInfo.h" #include <cassert> #include <utility> @@ -2196,7 +2197,7 @@ bool llvm::runIPSCCP( findReturnsToZap(*F, ReturnsToZap, Solver); } - for (const auto &F : Solver.getMRVFunctionsTracked()) { + for (auto F : Solver.getMRVFunctionsTracked()) { assert(F->getReturnType()->isStructTy() && "The return type should be a struct"); StructType *STy = cast<StructType>(F->getReturnType()); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 74b8ff9130502..89916e43fce29 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -41,7 +41,6 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -71,6 +70,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -80,6 +80,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include 
"llvm/Transforms/Utils/PromoteMemToReg.h" #include <algorithm> #include <cassert> @@ -361,7 +362,7 @@ private: /// The beginning and ending offsets of the alloca for this /// partition. - uint64_t BeginOffset, EndOffset; + uint64_t BeginOffset = 0, EndOffset = 0; /// The start and end iterators of this partition. iterator SI, SJ; @@ -1680,24 +1681,20 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, } /// Compute the adjusted alignment for a load or store from an offset. -static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, - const DataLayout &DL) { - unsigned Alignment; +static Align getAdjustedAlignment(Instruction *I, uint64_t Offset, + const DataLayout &DL) { + MaybeAlign Alignment; Type *Ty; if (auto *LI = dyn_cast<LoadInst>(I)) { - Alignment = LI->getAlignment(); + Alignment = MaybeAlign(LI->getAlignment()); Ty = LI->getType(); } else if (auto *SI = dyn_cast<StoreInst>(I)) { - Alignment = SI->getAlignment(); + Alignment = MaybeAlign(SI->getAlignment()); Ty = SI->getValueOperand()->getType(); } else { llvm_unreachable("Only loads and stores are allowed!"); } - - if (!Alignment) - Alignment = DL.getABITypeAlignment(Ty); - - return MinAlign(Alignment, Offset); + return commonAlignment(DL.getValueOrABITypeAlignment(Alignment, Ty), Offset); } /// Test whether we can convert a value from the old to the new type. @@ -2300,9 +2297,9 @@ class llvm::sroa::AllocaSliceRewriter // The new offsets of the slice currently being rewritten relative to the // original alloca. - uint64_t NewBeginOffset, NewEndOffset; + uint64_t NewBeginOffset = 0, NewEndOffset = 0; - uint64_t SliceSize; + uint64_t SliceSize = 0; bool IsSplittable = false; bool IsSplit = false; Use *OldUse = nullptr; @@ -2432,13 +2429,14 @@ private: /// /// You can optionally pass a type to this routine and if that type's ABI /// alignment is itself suitable, this will return zero. - unsigned getSliceAlign(Type *Ty = nullptr) { - unsigned NewAIAlign = NewAI.getAlignment(); - if (!NewAIAlign) - NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); - unsigned Align = - MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); - return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align; + MaybeAlign getSliceAlign(Type *Ty = nullptr) { + const MaybeAlign NewAIAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(NewAI.getAlignment()), NewAI.getAllocatedType()); + const MaybeAlign Align = + commonAlignment(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); + return (Ty && Align && Align->value() == DL.getABITypeAlignment(Ty)) + ? None + : Align; } unsigned getIndex(uint64_t Offset) { @@ -2800,7 +2798,7 @@ private: Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemSet( getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, - getSliceAlign(), II.isVolatile()); + MaybeAlign(getSliceAlign()), II.isVolatile()); if (AATags) New->setAAMetadata(AATags); LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); @@ -2886,7 +2884,7 @@ private: assert((IsDest && II.getRawDest() == OldPtr) || (!IsDest && II.getRawSource() == OldPtr)); - unsigned SliceAlign = getSliceAlign(); + MaybeAlign SliceAlign = getSliceAlign(); // For unsplit intrinsics, we simply modify the source and destination // pointers in place. This isn't just an optimization, it is a matter of @@ -2956,10 +2954,10 @@ private: // Compute the relative offset for the other pointer within the transfer. 
unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS); APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset); - unsigned OtherAlign = - IsDest ? II.getSourceAlignment() : II.getDestAlignment(); - OtherAlign = MinAlign(OtherAlign ? OtherAlign : 1, - OtherOffset.zextOrTrunc(64).getZExtValue()); + Align OtherAlign = + assumeAligned(IsDest ? II.getSourceAlignment() : II.getDestAlignment()); + OtherAlign = + commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue()); if (EmitMemCpy) { // Compute the other pointer, folding as much as possible to produce @@ -2972,7 +2970,7 @@ private: Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); Value *DestPtr, *SrcPtr; - unsigned DestAlign, SrcAlign; + MaybeAlign DestAlign, SrcAlign; // Note: IsDest is true iff we're copying into the new alloca slice if (IsDest) { DestPtr = OurPtr; @@ -3019,9 +3017,9 @@ private: Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, OtherPtr->getName() + "."); - unsigned SrcAlign = OtherAlign; + MaybeAlign SrcAlign = OtherAlign; Value *DstPtr = &NewAI; - unsigned DstAlign = SliceAlign; + MaybeAlign DstAlign = SliceAlign; if (!IsDest) { std::swap(SrcPtr, DstPtr); std::swap(SrcAlign, DstAlign); @@ -3117,20 +3115,17 @@ private: Instruction *I = Uses.pop_back_val(); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - unsigned LoadAlign = LI->getAlignment(); - if (!LoadAlign) - LoadAlign = DL.getABITypeAlignment(LI->getType()); - LI->setAlignment(MaybeAlign(std::min(LoadAlign, getSliceAlign()))); + MaybeAlign LoadAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(LI->getAlignment()), LI->getType()); + LI->setAlignment(std::min(LoadAlign, getSliceAlign())); continue; } if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - unsigned StoreAlign = SI->getAlignment(); - if (!StoreAlign) { Value *Op = SI->getOperand(0); - StoreAlign = DL.getABITypeAlignment(Op->getType()); - } - SI->setAlignment(MaybeAlign(std::min(StoreAlign, getSliceAlign()))); - continue; + MaybeAlign StoreAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(SI->getAlignment()), Op->getType()); + SI->setAlignment(std::min(StoreAlign, getSliceAlign())); + continue; } assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) || @@ -3222,7 +3217,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { /// The current pointer use being rewritten. This is used to dig up the used /// value (as opposed to the user). - Use *U; + Use *U = nullptr; /// Used to calculate offsets, and hence alignment, of subobjects. const DataLayout &DL; @@ -3277,7 +3272,7 @@ private: Type *BaseTy; /// Known alignment of the base pointer. - unsigned BaseAlign; + Align BaseAlign; /// To calculate offset of each component so we can correctly deduce /// alignments. @@ -3286,7 +3281,7 @@ private: /// Initialize the splitter with an insertion point, Ptr and start with a /// single zero GEP index. 
OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - unsigned BaseAlign, const DataLayout &DL) + Align BaseAlign, const DataLayout &DL) : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {} @@ -3308,7 +3303,7 @@ private: if (Ty->isSingleValueType()) { unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices); return static_cast<Derived *>(this)->emitFunc( - Ty, Agg, MinAlign(BaseAlign, Offset), Name); + Ty, Agg, commonAlignment(BaseAlign, Offset), Name); } if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { @@ -3349,18 +3344,20 @@ private: AAMDNodes AATags; LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - AAMDNodes AATags, unsigned BaseAlign, const DataLayout &DL) + AAMDNodes AATags, Align BaseAlign, const DataLayout &DL) : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, - DL), AATags(AATags) {} + DL), + AATags(AATags) {} /// Emit a leaf load of a single value. This is called at the leaves of the /// recursive emission to actually load values. - void emitFunc(Type *Ty, Value *&Agg, unsigned Align, const Twine &Name) { + void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) { assert(Ty->isSingleValueType()); // Load the single value and insert it using the indices. Value *GEP = IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); - LoadInst *Load = IRB.CreateAlignedLoad(Ty, GEP, Align, Name + ".load"); + LoadInst *Load = + IRB.CreateAlignedLoad(Ty, GEP, Alignment.value(), Name + ".load"); if (AATags) Load->setAAMetadata(AATags); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); @@ -3388,14 +3385,14 @@ private: struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - AAMDNodes AATags, unsigned BaseAlign, const DataLayout &DL) + AAMDNodes AATags, Align BaseAlign, const DataLayout &DL) : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL), AATags(AATags) {} AAMDNodes AATags; /// Emit a leaf store of a single value. This is called at the leaves of the /// recursive emission to actually produce stores. - void emitFunc(Type *Ty, Value *&Agg, unsigned Align, const Twine &Name) { + void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) { assert(Ty->isSingleValueType()); // Extract the single value and store it using the indices. 
// @@ -3406,7 +3403,7 @@ private: Value *InBoundsGEP = IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); StoreInst *Store = - IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Align); + IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment.value()); if (AATags) Store->setAAMetadata(AATags); LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); @@ -3863,8 +3860,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, BasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), PartPtrTy, BasePtr->getName() + "."), - getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, - LI->getName()); + getAdjustedAlignment(LI, PartOffset, DL).value(), + /*IsVolatile*/ false, LI->getName()); PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); @@ -3921,7 +3918,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), PartPtrTy, StoreBasePtr->getName() + "."), - getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); + getAdjustedAlignment(SI, PartOffset, DL).value(), + /*IsVolatile*/ false); PStore->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); @@ -4005,8 +4003,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, LoadBasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), LoadPartPtrTy, LoadBasePtr->getName() + "."), - getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, - LI->getName()); + getAdjustedAlignment(LI, PartOffset, DL).value(), + /*IsVolatile*/ false, LI->getName()); } // And store this partition. @@ -4017,7 +4015,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), StorePartPtrTy, StoreBasePtr->getName() + "."), - getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); + getAdjustedAlignment(SI, PartOffset, DL).value(), + /*IsVolatile*/ false); // Now build a new slice for the alloca. NewSlices.push_back( @@ -4152,20 +4151,19 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // FIXME: We might want to defer PHI speculation until after here. // FIXME: return nullptr; } else { - unsigned Alignment = AI.getAlignment(); - if (!Alignment) { - // The minimum alignment which users can rely on when the explicit - // alignment is omitted or zero is that required by the ABI for this - // type. - Alignment = DL.getABITypeAlignment(AI.getAllocatedType()); - } - Alignment = MinAlign(Alignment, P.beginOffset()); + // If alignment is unspecified we fallback on the one required by the ABI + // for this type. We also make sure the alignment is compatible with + // P.beginOffset(). + const Align Alignment = commonAlignment( + DL.getValueOrABITypeAlignment(MaybeAlign(AI.getAlignment()), + AI.getAllocatedType()), + P.beginOffset()); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. - if (Alignment <= DL.getABITypeAlignment(SliceTy)) - Alignment = 0; + const bool IsUnconstrained = Alignment <= DL.getABITypeAlignment(SliceTy); NewAI = new AllocaInst( - SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment, + SliceTy, AI.getType()->getAddressSpace(), nullptr, + IsUnconstrained ? MaybeAlign() : Alignment, AI.getName() + ".sroa." 
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index 1d2e40bf62be7..9d088547b4369 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -82,6 +82,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeLowerConstantIntrinsicsPass(Registry);
   initializeLowerExpectIntrinsicPass(Registry);
   initializeLowerGuardIntrinsicLegacyPassPass(Registry);
+  initializeLowerMatrixIntrinsicsLegacyPassPass(Registry);
   initializeLowerWidenableConditionLegacyPassPass(Registry);
   initializeMemCpyOptLegacyPassPass(Registry);
   initializeMergeICmpsLegacyPassPass(Registry);
@@ -89,6 +90,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeNaryReassociateLegacyPassPass(Registry);
   initializePartiallyInlineLibCallsLegacyPassPass(Registry);
   initializeReassociateLegacyPassPass(Registry);
+  initializeRedundantDbgInstEliminationPass(Registry);
   initializeRegToMemPass(Registry);
   initializeRewriteStatepointsForGCLegacyPassPass(Registry);
   initializeSCCPLegacyPassPass(Registry);
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 2ee1a3a95f2a6..c25c6c632b8f0 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -13,6 +13,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Transforms/Scalar/Scalarizer.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Twine.h"
@@ -21,6 +22,7 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -33,12 +35,12 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/Options.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/Scalarizer.h"
 #include <cassert>
 #include <cstdint>
 #include <iterator>
@@ -173,8 +175,8 @@ struct VectorLayout {
 class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
 public:
-  ScalarizerVisitor(unsigned ParallelLoopAccessMDKind)
-      : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind) {
+  ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT)
+      : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT) {
   }
 
   bool visit(Function &F);
@@ -214,6 +216,8 @@ private:
   GatherList Gathered;
 
   unsigned ParallelLoopAccessMDKind;
+
+  DominatorTree *DT;
 };
 
 class ScalarizerLegacyPass : public FunctionPass {
@@ -225,6 +229,11 @@ public:
   }
 
   bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage& AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
 };
 
 } // end anonymous namespace
@@ -232,6 +241,7 @@ public:
 char ScalarizerLegacyPass::ID = 0;
 
 INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer",
                       "Scalarize vector operations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer",
                     "Scalarize vector operations", false, false)
@@ -303,7 +313,8 @@
 bool ScalarizerLegacyPass::runOnFunction(Function &F) {
   Module &M = *F.getParent();
   unsigned ParallelLoopAccessMDKind =
       M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
-  ScalarizerVisitor Impl(ParallelLoopAccessMDKind);
+  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT);
   return Impl.visit(F);
 }
@@ -340,6 +351,15 @@ Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) {
     return Scatterer(BB, BB->begin(), V, &Scattered[V]);
   }
   if (Instruction *VOp = dyn_cast<Instruction>(V)) {
+    // When scalarizing PHI nodes we might try to examine/rewrite InsertElement
+    // nodes in predecessors. If those predecessors are unreachable from entry,
+    // then the IR in those blocks could have unexpected properties resulting in
+    // infinite loops in Scatterer::operator[]. By simply treating values
+    // originating from instructions in unreachable blocks as undef we do not
+    // need to analyse them further.
+    if (!DT->isReachableFromEntry(VOp->getParent()))
+      return Scatterer(Point->getParent(), Point->getIterator(),
+                       UndefValue::get(V->getType()));
     // Put the scattered form of an instruction directly after the
     // instruction.
     BasicBlock *BB = VOp->getParent();
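The guard added to scatter() above boils down to a simple pattern: values defined in blocks that the dominator tree cannot reach from the function entry are folded to undef instead of being analysed. A hedged, self-contained restatement follows, assuming the LLVM 10 DominatorTree API; the helper name is invented for illustration and is not part of the patch.

#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"

// IR in unreachable blocks may violate the dominance assumptions the
// scalarizer relies on (for example, a use that precedes its definition),
// so do not try to scalarize it; fold it to undef instead.
static llvm::Value *valueOrUndefIfUnreachable(llvm::Value *V,
                                              const llvm::DominatorTree &DT) {
  if (auto *I = llvm::dyn_cast<llvm::Instruction>(V))
    if (!DT.isReachableFromEntry(I->getParent()))
      return llvm::UndefValue::get(V->getType());
  return V;
}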
@@ -856,7 +876,10 @@ PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM)
   Module &M = *F.getParent();
   unsigned ParallelLoopAccessMDKind =
       M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
-  ScalarizerVisitor Impl(ParallelLoopAccessMDKind);
+  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT);
   bool Changed = Impl.visit(F);
-  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  return Changed ? PA : PreservedAnalyses::all();
 }
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 41554fccdf087..2a1a040bf83ee 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -164,7 +164,6 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
@@ -182,6 +181,7 @@
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -189,6 +189,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
 #include <string>
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index ac832b9b45671..d7a34acb43184 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -38,8 +38,10 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Use.h"
 #include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/GenericDomTree.h"
@@ -263,7 +265,7 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
 /// to an entirely separate nest.
 static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
                                  DominatorTree &DT, LoopInfo &LI,
-                                 MemorySSAUpdater *MSSAU) {
+                                 MemorySSAUpdater *MSSAU, ScalarEvolution *SE) {
   // If the loop is already at the top level, we can't hoist it anywhere.
   Loop *OldParentL = L.getParentLoop();
   if (!OldParentL)
@@ -317,7 +319,7 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
     // Because we just hoisted a loop out of this one, we have essentially
     // created new exit paths from it. That means we need to form LCSSA PHI
     // nodes for values used in the no-longer-nested loop.
-    formLCSSA(*OldContainingL, DT, &LI, nullptr);
+    formLCSSA(*OldContainingL, DT, &LI, SE);
 
     // We shouldn't need to form dedicated exits because the exit introduced
     // here is the (just split by unswitching) preheader. However, after trivial
@@ -329,6 +331,20 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
   }
 }
 
+// Return the top-most loop containing ExitBB and having ExitBB as exiting block
+// or the loop containing ExitBB, if there is no parent loop containing ExitBB
+// as exiting block.
+static Loop *getTopMostExitingLoop(BasicBlock *ExitBB, LoopInfo &LI) {
+  Loop *TopMost = LI.getLoopFor(ExitBB);
+  Loop *Current = TopMost;
+  while (Current) {
+    if (Current->isLoopExiting(ExitBB))
+      TopMost = Current;
+    Current = Current->getParentLoop();
+  }
+  return TopMost;
+}
+
 /// Unswitch a trivial branch if the condition is loop invariant.
 ///
 /// This routine should only be called when loop code leading to the branch has
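The new helper above feeds the ScalarEvolution invalidation in the next hunk: when several nested loops share ExitBB as an exiting block, cached trip counts for the outer loops also go stale once the branch is unswitched. A hedged sketch of the combined idiom, assuming the LLVM 10 LoopInfo and ScalarEvolution APIs; the function name is illustrative and not taken from the patch.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"

// Climb from the innermost loop around ExitBB to the outermost loop that
// still exits through ExitBB, then drop ScalarEvolution's cached results for
// that whole sub-nest in one call.
static void forgetLoopsExitingThrough(llvm::BasicBlock *ExitBB,
                                      llvm::LoopInfo &LI,
                                      llvm::ScalarEvolution &SE) {
  llvm::Loop *TopMost = LI.getLoopFor(ExitBB);
  for (llvm::Loop *Cur = TopMost; Cur; Cur = Cur->getParentLoop())
    if (Cur->isLoopExiting(ExitBB))
      TopMost = Cur;
  if (TopMost)
    SE.forgetLoop(TopMost); // also forgets loops nested inside it
}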
@@ -413,9 +429,10 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
   });
 
   // If we have scalar evolutions, we need to invalidate them including this
-  // loop and the loop containing the exit block.
+  // loop, the loop containing the exit block and the topmost parent loop
+  // exiting via LoopExitBB.
   if (SE) {
-    if (Loop *ExitL = LI.getLoopFor(LoopExitBB))
+    if (Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI))
       SE->forgetLoop(ExitL);
     else
       // Forget the entire nest as this exits the entire nest.
@@ -532,7 +549,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
   // If this was full unswitching, we may have changed the nesting relationship
   // for this loop so hoist it to its correct parent if needed.
   if (FullUnswitch)
-    hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU);
+    hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE);
 
   if (MSSAU && VerifyMemorySSA)
     MSSAU->getMemorySSA()->verifyMemorySSA();
@@ -825,7 +842,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
   // We may have changed the nesting relationship for this loop so hoist it to
   // its correct parent if needed.
-  hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU);
+  hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE);
 
   if (MSSAU && VerifyMemorySSA)
     MSSAU->getMemorySSA()->verifyMemorySSA();
@@ -2260,7 +2277,7 @@ static void unswitchNontrivialInvariants(
       // First build LCSSA for this loop so that we can preserve it when
       // forming dedicated exits. We don't want to perturb some other loop's
      // LCSSA while doing that CFG edit.
-      formLCSSA(UpdateL, DT, &LI, nullptr);
+      formLCSSA(UpdateL, DT, &LI, SE);
 
      // For loops reached by this loop's original exit blocks we may
      // introduced new, non-dedicated exits. At least try to re-form dedicated
@@ -2426,7 +2443,7 @@ turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
   if (MSSAU) {
     MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI));
-    MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::End);
+    MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator);
     if (VerifyMemorySSA)
       MSSAU->getMemorySSA()->verifyMemorySSA();
   }
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 4544975a4887b..623a8b711ed89 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -27,7 +27,6 @@
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
@@ -35,10 +34,12 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <utility>
 
 using namespace llvm;
diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp
index 90f3a2aa46e16..677d86f8c7b48 100644
--- a/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
@@ -78,7 +79,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
   if (auto *Call = dyn_cast<CallBase>(Inst)) {
     // Convergent operations cannot be made control-dependent on additional
     // values.
-    if (Call->hasFnAttr(Attribute::Convergent))
+    if (Call->isConvergent())
      return false;

    for (Instruction *S : Stores)
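The Sink.cpp change above swaps an explicit attribute query for CallBase::isConvergent(), which reads the same Convergent attribute. A hedged sketch of the check in isolation follows; the helper name is invented for illustration and is not part of the patch.

#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"

// Sinking an instruction adds new control dependences. Convergent calls
// (for example GPU cross-lane operations) must not gain control dependences,
// so they pin the instruction in place.
static bool cannotSinkBecauseConvergent(const llvm::Instruction &I) {
  if (const auto *Call = llvm::dyn_cast<llvm::CallBase>(&I))
    return Call->isConvergent();
  return false;
}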
diff --git a/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
index e6db11f47eadd..cd7bfb2f20dc7 100644
--- a/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -283,12 +283,12 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
     int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
     int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
     if (IID)
-      FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(),
-                                      IncomingC->getType());
+      FoldedCost += TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(),
+                                            IncomingC->getType());
     else
       FoldedCost +=
-          TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(),
-                            IncomingC->getType());
+          TTI.getIntImmCostInst(UserI->getOpcode(), Idx,
+                                IncomingC->getValue(), IncomingC->getType());
 
     // If we accumulate more folded cost for this incoming constant than
     // materialized cost, then we'll regress any edge with this constant so
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index f9d027eb4a3b9..c8d899bb4871c 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -67,6 +67,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index a58c32cc58943..9f82b1263ebd1 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -60,7 +60,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -76,10 +75,12 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
 #include <limits>
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 9791cf41f6212..4ce4ce46f67ab 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -34,8 +34,10 @@
 #include "llvm/IR/Use.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index b27a36b67d62e..9f0ab9103d429 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -76,6 +76,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
index 707adf46d1f4d..c8461fdc1608b 100644
--- a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
+++ b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
@@ -12,6 +12,7 @@
 #include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 
 using namespace llvm;
