| author | Dimitry Andric <dim@FreeBSD.org> | 2019-10-23 17:51:42 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2019-10-23 17:51:42 +0000 |
| commit | 1d5ae1026e831016fc29fd927877c86af904481f (patch) | |
| tree | 2cdfd12620fcfa5d9e4a0389f85368e8e36f63f9 /lib/Transforms/Scalar | |
| parent | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (diff) | |
| download | src-1d5ae1026e831016fc29fd927877c86af904481f.tar.gz, src-1d5ae1026e831016fc29fd927877c86af904481f.zip | |
Diffstat (limited to 'lib/Transforms/Scalar')
55 files changed, 3197 insertions, 919 deletions
diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index de9a62e88c27..0e9f03a06061 100644 --- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -93,9 +93,7 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV, const SCEV *AlignSCEV, ScalarEvolution *SE) { // DiffUnits = Diff % int64_t(Alignment) - const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV); - const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV); - const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV); + const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV); LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n"); @@ -323,7 +321,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { LI->getPointerOperand(), SE); if (NewAlignment > LI->getAlignment()) { - LI->setAlignment(NewAlignment); + LI->setAlignment(MaybeAlign(NewAlignment)); ++NumLoadAlignChanged; } } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) { @@ -331,7 +329,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { SI->getPointerOperand(), SE); if (NewAlignment > SI->getAlignment()) { - SI->setAlignment(NewAlignment); + SI->setAlignment(MaybeAlign(NewAlignment)); ++NumStoreAlignChanged; } } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) { diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index 3519b000a33f..c3fba923104f 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -562,7 +562,7 @@ struct CallSiteSplittingLegacyPass : public FunctionPass { if (skipFunction(F)) return false; - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); return doCallSiteSplitting(F, TLI, TTI, DT); diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index 98243a23f1ef..9f340afbf7c2 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -204,7 +204,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, /// set found in \p BBs. static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, BasicBlock *Entry, - SmallPtrSet<BasicBlock *, 8> &BBs) { + SetVector<BasicBlock *> &BBs) { assert(!BBs.count(Entry) && "Assume Entry is not in BBs"); // Nodes on the current path to the root. SmallPtrSet<BasicBlock *, 8> Path; @@ -257,7 +257,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, // Visit Orders in bottom-up order. using InsertPtsCostPair = - std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>; + std::pair<SetVector<BasicBlock *>, BlockFrequency>; // InsertPtsMap is a map from a BB to the best insertion points for the // subtree of BB (subtree not including the BB itself). 
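The ConstantHoisting hunks above replace SmallPtrSet with SetVector so that candidate insertion blocks are visited in a deterministic (insertion) order, and pop_back_val() then always pairs the two most recently added blocks. A minimal sketch of the property the pass now relies on — toy element type and a hypothetical mergeAll helper, not from the patch:

```cpp
#include "llvm/ADT/SetVector.h"

// SetVector iterates in insertion order (unlike SmallPtrSet, whose order
// depends on pointer values and can vary between runs), so repeatedly
// merging the two most recently inserted elements is reproducible.
int *mergeAll(llvm::SetVector<int *> &BBs, int *(*Merge)(int *, int *)) {
  while (BBs.size() >= 2) {
    int *BB1 = BBs.pop_back_val(); // last inserted
    int *BB2 = BBs.pop_back_val(); // second to last
    BBs.insert(Merge(BB1, BB2));   // e.g. their nearest common dominator
  }
  return BBs.empty() ? nullptr : *BBs.begin();
}
```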
@@ -266,7 +266,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) { BasicBlock *Node = *RIt; bool NodeInBBs = BBs.count(Node); - SmallPtrSet<BasicBlock *, 16> &InsertPts = InsertPtsMap[Node].first; + auto &InsertPts = InsertPtsMap[Node].first; BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second; // Return the optimal insert points in BBs. @@ -283,7 +283,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock(); // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child // will update its parent's ParentInsertPts and ParentPtsFreq. - SmallPtrSet<BasicBlock *, 16> &ParentInsertPts = InsertPtsMap[Parent].first; + auto &ParentInsertPts = InsertPtsMap[Parent].first; BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second; // Choose to insert in Node or in subtree of Node. // Don't hoist to EHPad because we may not find a proper place to insert @@ -305,12 +305,12 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, } /// Find an insertion point that dominates all uses. -SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint( +SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint( const ConstantInfo &ConstInfo) const { assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry."); // Collect all basic blocks. - SmallPtrSet<BasicBlock *, 8> BBs; - SmallPtrSet<Instruction *, 8> InsertPts; + SetVector<BasicBlock *> BBs; + SetVector<Instruction *> InsertPts; for (auto const &RCI : ConstInfo.RebasedConstants) for (auto const &U : RCI.Uses) BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent()); @@ -333,15 +333,13 @@ SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint( while (BBs.size() >= 2) { BasicBlock *BB, *BB1, *BB2; - BB1 = *BBs.begin(); - BB2 = *std::next(BBs.begin()); + BB1 = BBs.pop_back_val(); + BB2 = BBs.pop_back_val(); BB = DT->findNearestCommonDominator(BB1, BB2); if (BB == Entry) { InsertPts.insert(&Entry->front()); return InsertPts; } - BBs.erase(BB1); - BBs.erase(BB2); BBs.insert(BB); } assert((BBs.size() == 1) && "Expected only one element."); @@ -403,7 +401,7 @@ void ConstantHoistingPass::collectConstantCandidates( return; // Get offset from the base GV. - PointerType *GVPtrTy = dyn_cast<PointerType>(BaseGV->getType()); + PointerType *GVPtrTy = cast<PointerType>(BaseGV->getType()); IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace()); APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true); auto *GEPO = cast<GEPOperator>(ConstExpr); @@ -830,7 +828,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) { SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec = BaseGV ? ConstGEPInfoMap[BaseGV] : ConstIntInfoVec; for (auto const &ConstInfo : ConstInfoVec) { - SmallPtrSet<Instruction *, 8> IPSet = findConstantInsertionPoint(ConstInfo); + SetVector<Instruction *> IPSet = findConstantInsertionPoint(ConstInfo); // We can have an empty set if the function contains unreachable blocks. 
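For context on the merging loop above: findNearestCommonDominator walks the dominator tree until the two query paths meet. A self-contained toy version on a tree encoded as a parent array (hypothetical helper; node 0 stands in for the function entry):

```cpp
#include <vector>

// Toy nearest-common-dominator on a tree given as a parent array; node 0 is
// the root. Mark the path from A up to the root, then walk up from B until
// hitting a marked node.
int nearestCommonDominator(const std::vector<int> &Parent, int A, int B) {
  std::vector<bool> OnPathFromA(Parent.size(), false);
  for (int X = A;; X = Parent[X]) {
    OnPathFromA[X] = true;
    if (X == 0) break;
  }
  for (int X = B;; X = Parent[X]) {
    if (OnPathFromA[X]) return X; // root is always marked, so this fires
    if (X == 0) break;
  }
  return 0;
}
```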
if (IPSet.empty()) continue; diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index 770321c740a0..e9e6afe3fdd4 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -82,7 +82,7 @@ bool ConstantPropagation::runOnFunction(Function &F) { bool Changed = false; const DataLayout &DL = F.getParent()->getDataLayout(); TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); while (!WorkList.empty()) { SmallVector<Instruction*, 16> NewWorkListVec; diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 89497177524f..2ef85268df48 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -62,6 +62,23 @@ STATISTIC(NumSDivs, "Number of sdiv converted to udiv"); STATISTIC(NumUDivs, "Number of udivs whose width was decreased"); STATISTIC(NumAShrs, "Number of ashr converted to lshr"); STATISTIC(NumSRems, "Number of srem converted to urem"); +STATISTIC(NumSExt, "Number of sext converted to zext"); +STATISTIC(NumAnd, "Number of ands removed"); +STATISTIC(NumNW, "Number of no-wrap deductions"); +STATISTIC(NumNSW, "Number of no-signed-wrap deductions"); +STATISTIC(NumNUW, "Number of no-unsigned-wrap deductions"); +STATISTIC(NumAddNW, "Number of no-wrap deductions for add"); +STATISTIC(NumAddNSW, "Number of no-signed-wrap deductions for add"); +STATISTIC(NumAddNUW, "Number of no-unsigned-wrap deductions for add"); +STATISTIC(NumSubNW, "Number of no-wrap deductions for sub"); +STATISTIC(NumSubNSW, "Number of no-signed-wrap deductions for sub"); +STATISTIC(NumSubNUW, "Number of no-unsigned-wrap deductions for sub"); +STATISTIC(NumMulNW, "Number of no-wrap deductions for mul"); +STATISTIC(NumMulNSW, "Number of no-signed-wrap deductions for mul"); +STATISTIC(NumMulNUW, "Number of no-unsigned-wrap deductions for mul"); +STATISTIC(NumShlNW, "Number of no-wrap deductions for shl"); +STATISTIC(NumShlNSW, "Number of no-signed-wrap deductions for shl"); +STATISTIC(NumShlNUW, "Number of no-unsigned-wrap deductions for shl"); STATISTIC(NumOverflows, "Number of overflow checks removed"); STATISTIC(NumSaturating, "Number of saturating arithmetics converted to normal arithmetics"); @@ -85,6 +102,7 @@ namespace { AU.addRequired<LazyValueInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<LazyValueInfoWrapperPass>(); } }; @@ -416,37 +434,96 @@ static bool willNotOverflow(BinaryOpIntrinsic *BO, LazyValueInfo *LVI) { return NWRegion.contains(LRange); } -static void processOverflowIntrinsic(WithOverflowInst *WO) { - IRBuilder<> B(WO); - Value *NewOp = B.CreateBinOp( - WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), WO->getName()); - // Constant-folding could have happened. 
- if (auto *Inst = dyn_cast<Instruction>(NewOp)) { - if (WO->isSigned()) +static void setDeducedOverflowingFlags(Value *V, Instruction::BinaryOps Opcode, + bool NewNSW, bool NewNUW) { + Statistic *OpcNW, *OpcNSW, *OpcNUW; + switch (Opcode) { + case Instruction::Add: + OpcNW = &NumAddNW; + OpcNSW = &NumAddNSW; + OpcNUW = &NumAddNUW; + break; + case Instruction::Sub: + OpcNW = &NumSubNW; + OpcNSW = &NumSubNSW; + OpcNUW = &NumSubNUW; + break; + case Instruction::Mul: + OpcNW = &NumMulNW; + OpcNSW = &NumMulNSW; + OpcNUW = &NumMulNUW; + break; + case Instruction::Shl: + OpcNW = &NumShlNW; + OpcNSW = &NumShlNSW; + OpcNUW = &NumShlNUW; + break; + default: + llvm_unreachable("Will not be called with other binops"); + } + + auto *Inst = dyn_cast<Instruction>(V); + if (NewNSW) { + ++NumNW; + ++*OpcNW; + ++NumNSW; + ++*OpcNSW; + if (Inst) Inst->setHasNoSignedWrap(); - else + } + if (NewNUW) { + ++NumNW; + ++*OpcNW; + ++NumNUW; + ++*OpcNUW; + if (Inst) Inst->setHasNoUnsignedWrap(); } +} - Value *NewI = B.CreateInsertValue(UndefValue::get(WO->getType()), NewOp, 0); - NewI = B.CreateInsertValue(NewI, ConstantInt::getFalse(WO->getContext()), 1); +static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI); + +// Rewrite this with.overflow intrinsic as non-overflowing. +static void processOverflowIntrinsic(WithOverflowInst *WO, LazyValueInfo *LVI) { + IRBuilder<> B(WO); + Instruction::BinaryOps Opcode = WO->getBinaryOp(); + bool NSW = WO->isSigned(); + bool NUW = !WO->isSigned(); + + Value *NewOp = + B.CreateBinOp(Opcode, WO->getLHS(), WO->getRHS(), WO->getName()); + setDeducedOverflowingFlags(NewOp, Opcode, NSW, NUW); + + StructType *ST = cast<StructType>(WO->getType()); + Constant *Struct = ConstantStruct::get(ST, + { UndefValue::get(ST->getElementType(0)), + ConstantInt::getFalse(ST->getElementType(1)) }); + Value *NewI = B.CreateInsertValue(Struct, NewOp, 0); WO->replaceAllUsesWith(NewI); WO->eraseFromParent(); ++NumOverflows; + + // See if we can infer the other no-wrap too. + if (auto *BO = dyn_cast<BinaryOperator>(NewOp)) + processBinOp(BO, LVI); } -static void processSaturatingInst(SaturatingInst *SI) { +static void processSaturatingInst(SaturatingInst *SI, LazyValueInfo *LVI) { + Instruction::BinaryOps Opcode = SI->getBinaryOp(); + bool NSW = SI->isSigned(); + bool NUW = !SI->isSigned(); BinaryOperator *BinOp = BinaryOperator::Create( - SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI); + Opcode, SI->getLHS(), SI->getRHS(), SI->getName(), SI); BinOp->setDebugLoc(SI->getDebugLoc()); - if (SI->isSigned()) - BinOp->setHasNoSignedWrap(); - else - BinOp->setHasNoUnsignedWrap(); + setDeducedOverflowingFlags(BinOp, Opcode, NSW, NUW); SI->replaceAllUsesWith(BinOp); SI->eraseFromParent(); ++NumSaturating; + + // See if we can infer the other no-wrap too. + if (auto *BO = dyn_cast<BinaryOperator>(BinOp)) + processBinOp(BO, LVI); } /// Infer nonnull attributes for the arguments at the specified callsite. 
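The processOverflowIntrinsic/processSaturatingInst rewrites above rest on a simple semantic fact: once LVI proves the operation cannot overflow, the {result, overflow-bit} pair collapses to {plain op, false}, and the plain op may carry nsw/nuw. A plain-C++ model of that equivalence (hypothetical function names; the real code builds IR with IRBuilder):

```cpp
#include <cstdint>
#include <limits>
#include <utility>

// Reference semantics of llvm.sadd.with.overflow on i32.
std::pair<int32_t, bool> saddWithOverflow(int32_t A, int32_t B) {
  int64_t Wide = int64_t(A) + int64_t(B);
  bool Overflow = Wide < std::numeric_limits<int32_t>::min() ||
                  Wide > std::numeric_limits<int32_t>::max();
  return {int32_t(Wide), Overflow};
}

// What the pass rewrites it to, valid only under the proven precondition
// (willNotOverflow) that A + B stays in range: a plain add (nsw in IR) and
// a constant-false overflow bit.
std::pair<int32_t, bool> rewritten(int32_t A, int32_t B) {
  return {A + B, false};
}
```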
@@ -456,14 +533,14 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { if (auto *WO = dyn_cast<WithOverflowInst>(CS.getInstruction())) { if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) { - processOverflowIntrinsic(WO); + processOverflowIntrinsic(WO, LVI); return true; } } if (auto *SI = dyn_cast<SaturatingInst>(CS.getInstruction())) { if (SI->getType()->isIntegerTy() && willNotOverflow(SI, LVI)) { - processSaturatingInst(SI); + processSaturatingInst(SI, LVI); return true; } } @@ -632,6 +709,27 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { return true; } +static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) { + if (SDI->getType()->isVectorTy()) + return false; + + Value *Base = SDI->getOperand(0); + + Constant *Zero = ConstantInt::get(Base->getType(), 0); + if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, Base, Zero, SDI) != + LazyValueInfo::True) + return false; + + ++NumSExt; + auto *ZExt = + CastInst::CreateZExtOrBitCast(Base, SDI->getType(), SDI->getName(), SDI); + ZExt->setDebugLoc(SDI->getDebugLoc()); + SDI->replaceAllUsesWith(ZExt); + SDI->eraseFromParent(); + + return true; +} + static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { using OBO = OverflowingBinaryOperator; @@ -648,6 +746,7 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { BasicBlock *BB = BinOp->getParent(); + Instruction::BinaryOps Opcode = BinOp->getOpcode(); Value *LHS = BinOp->getOperand(0); Value *RHS = BinOp->getOperand(1); @@ -655,24 +754,48 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { ConstantRange RRange = LVI->getConstantRange(RHS, BB, BinOp); bool Changed = false; + bool NewNUW = false, NewNSW = false; if (!NUW) { ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion( - BinOp->getOpcode(), RRange, OBO::NoUnsignedWrap); - bool NewNUW = NUWRange.contains(LRange); - BinOp->setHasNoUnsignedWrap(NewNUW); + Opcode, RRange, OBO::NoUnsignedWrap); + NewNUW = NUWRange.contains(LRange); Changed |= NewNUW; } if (!NSW) { ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion( - BinOp->getOpcode(), RRange, OBO::NoSignedWrap); - bool NewNSW = NSWRange.contains(LRange); - BinOp->setHasNoSignedWrap(NewNSW); + Opcode, RRange, OBO::NoSignedWrap); + NewNSW = NSWRange.contains(LRange); Changed |= NewNSW; } + setDeducedOverflowingFlags(BinOp, Opcode, NewNSW, NewNUW); + return Changed; } +static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) { + if (BinOp->getType()->isVectorTy()) + return false; + + // Pattern match (and lhs, C) where C includes a superset of bits which might + // be set in lhs. This is a common truncation idiom created by instcombine. 
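Concretely, the new processAnd fold relies on this range argument — a runnable sketch with the LVI-supplied bound passed in as a parameter (names hypothetical, not from the patch):

```cpp
#include <cstdint>

// (X & Mask) == X whenever every bit that can be set in X is covered by the
// mask, i.e. X's unsigned maximum is <= Mask for a low-bit mask like 0xFF.
// In the pass, LazyValueInfo supplies the bound; here it is a parameter.
uint32_t foldAnd(uint32_t X, uint32_t UnsignedMaxOfX) {
  const uint32_t Mask = 0xFF; // isMask(): a run of contiguous low bits
  if (UnsignedMaxOfX <= Mask)
    return X;      // the 'and' is a no-op; the pass deletes it
  return X & Mask; // otherwise the truncation must stay
}
```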
+ BasicBlock *BB = BinOp->getParent(); + Value *LHS = BinOp->getOperand(0); + ConstantInt *RHS = dyn_cast<ConstantInt>(BinOp->getOperand(1)); + if (!RHS || !RHS->getValue().isMask()) + return false; + + ConstantRange LRange = LVI->getConstantRange(LHS, BB, BinOp); + if (!LRange.getUnsignedMax().ule(RHS->getValue())) + return false; + + BinOp->replaceAllUsesWith(LHS); + BinOp->eraseFromParent(); + NumAnd++; + return true; +} + + static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) { if (Constant *C = LVI->getConstant(V, At->getParent(), At)) return C; @@ -740,10 +863,18 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT, case Instruction::AShr: BBChanged |= processAShr(cast<BinaryOperator>(II), LVI); break; + case Instruction::SExt: + BBChanged |= processSExt(cast<SExtInst>(II), LVI); + break; case Instruction::Add: case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: BBChanged |= processBinOp(cast<BinaryOperator>(II), LVI); break; + case Instruction::And: + BBChanged |= processAnd(cast<BinaryOperator>(II), LVI); + break; } } @@ -796,5 +927,6 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) { PreservedAnalyses PA; PA.preserve<GlobalsAA>(); PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LazyValueAnalysis>(); return PA; } diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index 479e0ed74074..a79d775aa7f3 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -38,17 +38,19 @@ namespace { //===--------------------------------------------------------------------===// // DeadInstElimination pass implementation // - struct DeadInstElimination : public BasicBlockPass { - static char ID; // Pass identification, replacement for typeid - DeadInstElimination() : BasicBlockPass(ID) { - initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); - } - bool runOnBasicBlock(BasicBlock &BB) override { - if (skipBasicBlock(BB)) - return false; - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; - bool Changed = false; +struct DeadInstElimination : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + DeadInstElimination() : FunctionPass(ID) { + initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; + + bool Changed = false; + for (auto &BB : F) { for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { Instruction *Inst = &*DI++; if (isInstructionTriviallyDead(Inst, TLI)) { @@ -60,13 +62,14 @@ namespace { ++DIEEliminated; } } - return Changed; } + return Changed; + } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } - }; +}; } char DeadInstElimination::ID = 0; @@ -154,7 +157,7 @@ struct DCELegacyPass : public FunctionPass { return false; auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; + TargetLibraryInfo *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; return eliminateDeadCode(F, TLI); } diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index a81645745b48..685de82810ed 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -1254,8 +1254,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, auto *SI = new StoreInst( ConstantInt::get(Earlier->getValueOperand()->getType(), Merged), - Earlier->getPointerOperand(), false, Earlier->getAlignment(), - Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite); + Earlier->getPointerOperand(), false, + MaybeAlign(Earlier->getAlignment()), Earlier->getOrdering(), + Earlier->getSyncScopeID(), DepWrite); unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, @@ -1361,7 +1362,7 @@ public: MemoryDependenceResults *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); return eliminateDeadStores(F, AA, MD, DT, TLI); } diff --git a/lib/Transforms/Scalar/DivRemPairs.cpp b/lib/Transforms/Scalar/DivRemPairs.cpp index 876681b4f9de..934853507478 100644 --- a/lib/Transforms/Scalar/DivRemPairs.cpp +++ b/lib/Transforms/Scalar/DivRemPairs.cpp @@ -1,4 +1,4 @@ -//===- DivRemPairs.cpp - Hoist/decompose division and remainder -*- C++ -*-===// +//===- DivRemPairs.cpp - Hoist/[dr]ecompose division and remainder --------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This pass hoists and/or decomposes integer division and remainder +// This pass hoists and/or decomposes/recomposes integer division and remainder // instructions to enable CFG improvements and better codegen. // //===----------------------------------------------------------------------===// @@ -19,37 +19,105 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" + using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "div-rem-pairs" STATISTIC(NumPairs, "Number of div/rem pairs"); +STATISTIC(NumRecomposed, "Number of instructions recomposed"); STATISTIC(NumHoisted, "Number of instructions hoisted"); STATISTIC(NumDecomposed, "Number of instructions decomposed"); DEBUG_COUNTER(DRPCounter, "div-rem-pairs-transform", "Controls transformations in div-rem-pairs pass"); -/// Find matching pairs of integer div/rem ops (they have the same numerator, -/// denominator, and signedness). If they exist in different basic blocks, bring -/// them together by hoisting or replace the common division operation that is -/// implicit in the remainder: -/// X % Y <--> X - ((X / Y) * Y). -/// -/// We can largely ignore the normal safety and cost constraints on speculation -/// of these ops when we find a matching pair. This is because we are already -/// guaranteed that any exceptions and most cost are already incurred by the -/// first member of the pair. 
-/// -/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or -/// SimplifyCFG, but it's split off on its own because it's different enough -/// that it doesn't quite match the stated objectives of those passes. -static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, - const DominatorTree &DT) { - bool Changed = false; +namespace { +struct ExpandedMatch { + DivRemMapKey Key; + Instruction *Value; +}; +} // namespace + +/// See if we can match: (which is the form we expand into) +/// X - ((X ?/ Y) * Y) +/// which is equivalent to: +/// X ?% Y +static llvm::Optional<ExpandedMatch> matchExpandedRem(Instruction &I) { + Value *Dividend, *XroundedDownToMultipleOfY; + if (!match(&I, m_Sub(m_Value(Dividend), m_Value(XroundedDownToMultipleOfY)))) + return llvm::None; + + Value *Divisor; + Instruction *Div; + // Look for ((X / Y) * Y) + if (!match( + XroundedDownToMultipleOfY, + m_c_Mul(m_CombineAnd(m_IDiv(m_Specific(Dividend), m_Value(Divisor)), + m_Instruction(Div)), + m_Deferred(Divisor)))) + return llvm::None; + + ExpandedMatch M; + M.Key.SignedOp = Div->getOpcode() == Instruction::SDiv; + M.Key.Dividend = Dividend; + M.Key.Divisor = Divisor; + M.Value = &I; + return M; +} + +/// A thin wrapper to store two values that we matched as div-rem pair. +/// We want this extra indirection to avoid dealing with RAUW'ing the map keys. +struct DivRemPairWorklistEntry { + /// The actual udiv/sdiv instruction. Source of truth. + AssertingVH<Instruction> DivInst; + + /// The instruction that we have matched as a remainder instruction. + /// Should only be used as Value, don't introspect it. + AssertingVH<Instruction> RemInst; + + DivRemPairWorklistEntry(Instruction *DivInst_, Instruction *RemInst_) + : DivInst(DivInst_), RemInst(RemInst_) { + assert((DivInst->getOpcode() == Instruction::UDiv || + DivInst->getOpcode() == Instruction::SDiv) && + "Not a division."); + assert(DivInst->getType() == RemInst->getType() && "Types should match."); + // We can't check anything else about remainder instruction, + // it's not strictly required to be a urem/srem. + } + /// The type for this pair, identical for both the div and rem. + Type *getType() const { return DivInst->getType(); } + + /// Is this pair signed or unsigned? + bool isSigned() const { return DivInst->getOpcode() == Instruction::SDiv; } + + /// In this pair, what are the divident and divisor? + Value *getDividend() const { return DivInst->getOperand(0); } + Value *getDivisor() const { return DivInst->getOperand(1); } + + bool isRemExpanded() const { + switch (RemInst->getOpcode()) { + case Instruction::SRem: + case Instruction::URem: + return false; // single 'rem' instruction - unexpanded form. + default: + return true; // anything else means we have remainder in expanded form. + } + } +}; +using DivRemWorklistTy = SmallVector<DivRemPairWorklistEntry, 4>; + +/// Find matching pairs of integer div/rem ops (they have the same numerator, +/// denominator, and signedness). Place those pairs into a worklist for further +/// processing. This indirection is needed because we have to use TrackingVH<> +/// because we will be doing RAUW, and if one of the rem instructions we change +/// happens to be an input to another div/rem in the maps, we'd have problems. +static DivRemWorklistTy getWorklist(Function &F) { // Insert all divide and remainder instructions into maps keyed by their // operands and opcode (signed or unsigned). 
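matchExpandedRem above recognizes exactly the form the pass itself expands remainders into. The underlying identity, checked exhaustively over a small range in a self-contained program (illustration, not from the patch):

```cpp
#include <cassert>

// For any Y != 0 (and no signed-division overflow), the expanded form the
// pass emits equals the remainder instruction it replaced:
//   X - (X / Y) * Y == X % Y
int expandedRem(int X, int Y) { return X - (X / Y) * Y; }

int main() {
  for (int X = -100; X <= 100; ++X)
    for (int Y = -10; Y <= 10; ++Y)
      if (Y != 0)
        assert(expandedRem(X, Y) == X % Y);
  return 0;
}
```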
DenseMap<DivRemMapKey, Instruction *> DivMap; @@ -66,9 +134,14 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I; else if (I.getOpcode() == Instruction::URem) RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I; + else if (auto Match = matchExpandedRem(I)) + RemMap[Match->Key] = Match->Value; } } + // We'll accumulate the matching pairs of div-rem instructions here. + DivRemWorklistTy Worklist; + // We can iterate over either map because we are only looking for matched // pairs. Choose remainders for efficiency because they are usually even more // rare than division. @@ -78,12 +151,77 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, if (!DivInst) continue; - // We have a matching pair of div/rem instructions. If one dominates the - // other, hoist and/or replace one. + // We have a matching pair of div/rem instructions. NumPairs++; Instruction *RemInst = RemPair.second; - bool IsSigned = DivInst->getOpcode() == Instruction::SDiv; - bool HasDivRemOp = TTI.hasDivRemOp(DivInst->getType(), IsSigned); + + // Place it in the worklist. + Worklist.emplace_back(DivInst, RemInst); + } + + return Worklist; +} + +/// Find matching pairs of integer div/rem ops (they have the same numerator, +/// denominator, and signedness). If they exist in different basic blocks, bring +/// them together by hoisting or replace the common division operation that is +/// implicit in the remainder: +/// X % Y <--> X - ((X / Y) * Y). +/// +/// We can largely ignore the normal safety and cost constraints on speculation +/// of these ops when we find a matching pair. This is because we are already +/// guaranteed that any exceptions and most cost are already incurred by the +/// first member of the pair. +/// +/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or +/// SimplifyCFG, but it's split off on its own because it's different enough +/// that it doesn't quite match the stated objectives of those passes. +static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, + const DominatorTree &DT) { + bool Changed = false; + + // Get the matching pairs of div-rem instructions. We want this extra + // indirection to avoid dealing with having to RAUW the keys of the maps. + DivRemWorklistTy Worklist = getWorklist(F); + + // Process each entry in the worklist. + for (DivRemPairWorklistEntry &E : Worklist) { + if (!DebugCounter::shouldExecute(DRPCounter)) + continue; + + bool HasDivRemOp = TTI.hasDivRemOp(E.getType(), E.isSigned()); + + auto &DivInst = E.DivInst; + auto &RemInst = E.RemInst; + + const bool RemOriginallyWasInExpandedForm = E.isRemExpanded(); + (void)RemOriginallyWasInExpandedForm; // suppress unused variable warning + + if (HasDivRemOp && E.isRemExpanded()) { + // The target supports div+rem but the rem is expanded. + // We should recompose it first. + Value *X = E.getDividend(); + Value *Y = E.getDivisor(); + Instruction *RealRem = E.isSigned() ? BinaryOperator::CreateSRem(X, Y) + : BinaryOperator::CreateURem(X, Y); + // Note that we place it right next to the original expanded instruction, + // and letting further handling to move it if needed. + RealRem->setName(RemInst->getName() + ".recomposed"); + RealRem->insertAfter(RemInst); + Instruction *OrigRemInst = RemInst; + // Update AssertingVH<> with new instruction so it doesn't assert. + RemInst = RealRem; + // And replace the original instruction with the new one. 
+ OrigRemInst->replaceAllUsesWith(RealRem); + OrigRemInst->eraseFromParent(); + NumRecomposed++; + // Note that we have left ((X / Y) * Y) around. + // If it had other uses we could rewrite it as X - X % Y + } + + assert((!E.isRemExpanded() || !HasDivRemOp) && + "*If* the target supports div-rem, then by now the RemInst *is* " + "Instruction::[US]Rem."); // If the target supports div+rem and the instructions are in the same block // already, there's nothing to do. The backend should handle this. If the @@ -92,10 +230,16 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, continue; bool DivDominates = DT.dominates(DivInst, RemInst); - if (!DivDominates && !DT.dominates(RemInst, DivInst)) + if (!DivDominates && !DT.dominates(RemInst, DivInst)) { + // We have matching div-rem pair, but they are in two different blocks, + // neither of which dominates one another. + // FIXME: We could hoist both ops to the common predecessor block? continue; + } - if (!DebugCounter::shouldExecute(DRPCounter)) + // The target does not have a single div/rem operation, + // and the rem is already in expanded form. Nothing to do. + if (!HasDivRemOp && E.isRemExpanded()) continue; if (HasDivRemOp) { @@ -107,11 +251,17 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, DivInst->moveAfter(RemInst); NumHoisted++; } else { - // The target does not have a single div/rem operation. Decompose the - // remainder calculation as: + // The target does not have a single div/rem operation, + // and the rem is *not* in a already-expanded form. + // Decompose the remainder calculation as: // X % Y --> X - ((X / Y) * Y). - Value *X = RemInst->getOperand(0); - Value *Y = RemInst->getOperand(1); + + assert(!RemOriginallyWasInExpandedForm && + "We should not be expanding if the rem was in expanded form to " + "begin with."); + + Value *X = E.getDividend(); + Value *Y = E.getDivisor(); Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y); Instruction *Sub = BinaryOperator::CreateSub(X, Mul); @@ -152,8 +302,13 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, // Now kill the explicit remainder. We have replaced it with: // (sub X, (mul (div X, Y), Y) - RemInst->replaceAllUsesWith(Sub); - RemInst->eraseFromParent(); + Sub->setName(RemInst->getName() + ".decomposed"); + Instruction *OrigRemInst = RemInst; + // Update AssertingVH<> with new instruction so it doesn't assert. + RemInst = Sub; + // And replace the original instruction with the new one. + OrigRemInst->replaceAllUsesWith(Sub); + OrigRemInst->eraseFromParent(); NumDecomposed++; } Changed = true; @@ -188,7 +343,7 @@ struct DivRemPairsLegacyPass : public FunctionPass { return optimizeDivRem(F, TTI, DT); } }; -} +} // namespace char DivRemPairsLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs", diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index f1f075257020..ce540683dae2 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -108,11 +108,12 @@ struct SimpleValue { // This can only handle non-void readnone functions. 
if (CallInst *CI = dyn_cast<CallInst>(Inst)) return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy(); - return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) || - isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) || - isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || - isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) || - isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); + return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) || + isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) || + isa<CmpInst>(Inst) || isa<SelectInst>(Inst) || + isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) || + isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) || + isa<InsertValueInst>(Inst); } }; @@ -240,7 +241,7 @@ static unsigned getHashValueImpl(SimpleValue Val) { assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) || isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) || - isa<ShuffleVectorInst>(Inst)) && + isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst)) && "Invalid/unknown instruction"); // Mix in the opcode. @@ -526,7 +527,7 @@ public: const TargetTransformInfo &TTI, DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA) : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA), - MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {} + MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {} bool run(); @@ -651,7 +652,7 @@ private: bool isInvariantLoad() const { if (auto *LI = dyn_cast<LoadInst>(Inst)) - return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr; + return LI->hasMetadata(LLVMContext::MD_invariant_load); return false; } @@ -790,7 +791,7 @@ bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) { // A location loaded from with an invariant_load is assumed to *never* change // within the visible scope of the compilation. 
if (auto *LI = dyn_cast<LoadInst>(I)) - if (LI->getMetadata(LLVMContext::MD_invariant_load)) + if (LI->hasMetadata(LLVMContext::MD_invariant_load)) return true; auto MemLocOpt = MemoryLocation::getOrNone(I); @@ -1359,7 +1360,7 @@ public: if (skipFunction(F)) return false; - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -1381,6 +1382,7 @@ public: AU.addPreserved<MemorySSAWrapperPass>(); } AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); AU.setPreservesCFG(); } }; diff --git a/lib/Transforms/Scalar/FlattenCFGPass.cpp b/lib/Transforms/Scalar/FlattenCFGPass.cpp index 31670b1464e4..e6abf1ceb026 100644 --- a/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -11,10 +11,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + using namespace llvm; #define DEBUG_TYPE "flattencfg" @@ -52,15 +54,23 @@ FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); } static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { bool Changed = false; bool LocalChange = true; + + // Use block handles instead of iterating over function blocks directly + // to avoid using iterators invalidated by erasing blocks. + std::vector<WeakVH> Blocks; + Blocks.reserve(F.size()); + for (auto &BB : F) + Blocks.push_back(&BB); + while (LocalChange) { LocalChange = false; - // Loop over all of the basic blocks and remove them if they are unneeded... - // - for (Function::iterator BBIt = F.begin(); BBIt != F.end();) { - if (FlattenCFG(&*BBIt++, AA)) { - LocalChange = true; - } + // Loop over all of the basic blocks and try to flatten them. + for (WeakVH &BlockHandle : Blocks) { + // Skip blocks erased by FlattenCFG. + if (auto *BB = cast_or_null<BasicBlock>(BlockHandle)) + if (FlattenCFG(BB, AA)) + LocalChange = true; } Changed |= LocalChange; } diff --git a/lib/Transforms/Scalar/Float2Int.cpp b/lib/Transforms/Scalar/Float2Int.cpp index 4f83e869b303..4d2eac0451df 100644 --- a/lib/Transforms/Scalar/Float2Int.cpp +++ b/lib/Transforms/Scalar/Float2Int.cpp @@ -60,11 +60,13 @@ namespace { if (skipFunction(F)) return false; - return Impl.runImpl(F); + const DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + return Impl.runImpl(F, DT); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } @@ -116,21 +118,29 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) { // Find the roots - instructions that convert from the FP domain to // integer domain. -void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { - for (auto &I : instructions(F)) { - if (isa<VectorType>(I.getType())) +void Float2IntPass::findRoots(Function &F, const DominatorTree &DT, + SmallPtrSet<Instruction*,8> &Roots) { + for (BasicBlock &BB : F) { + // Unreachable code can take on strange forms that we are not prepared to + // handle. 
For example, an instruction may have itself as an operand. + if (!DT.isReachableFromEntry(&BB)) continue; - switch (I.getOpcode()) { - default: break; - case Instruction::FPToUI: - case Instruction::FPToSI: - Roots.insert(&I); - break; - case Instruction::FCmp: - if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != - CmpInst::BAD_ICMP_PREDICATE) + + for (Instruction &I : BB) { + if (isa<VectorType>(I.getType())) + continue; + switch (I.getOpcode()) { + default: break; + case Instruction::FPToUI: + case Instruction::FPToSI: Roots.insert(&I); - break; + break; + case Instruction::FCmp: + if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != + CmpInst::BAD_ICMP_PREDICATE) + Roots.insert(&I); + break; + } } } } @@ -503,7 +513,7 @@ void Float2IntPass::cleanup() { I.first->eraseFromParent(); } -bool Float2IntPass::runImpl(Function &F) { +bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) { LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n"); // Clear out all state. ECs = EquivalenceClasses<Instruction*>(); @@ -513,7 +523,7 @@ bool Float2IntPass::runImpl(Function &F) { Ctx = &F.getParent()->getContext(); - findRoots(F, Roots); + findRoots(F, DT, Roots); walkBackwards(Roots); walkForwards(); @@ -527,8 +537,9 @@ bool Float2IntPass::runImpl(Function &F) { namespace llvm { FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); } -PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) { - if (!runImpl(F)) +PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &AM) { + const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + if (!runImpl(F, DT)) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 1a02e9d33f49..743353eaea22 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -70,6 +70,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -626,6 +627,8 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) { PA.preserve<DominatorTreeAnalysis>(); PA.preserve<GlobalsAA>(); PA.preserve<TargetLibraryAnalysis>(); + if (LI) + PA.preserve<LoopAnalysis>(); return PA; } @@ -1161,15 +1164,30 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Do PHI translation to get its value in the predecessor if necessary. The // returned pointer (if non-null) is guaranteed to dominate UnavailablePred. + // We do the translation for each edge we skipped by going from LI's block + // to LoadBB, otherwise we might miss pieces needing translation. // If all preds have a single successor, then we know it is safe to insert // the load on the pred (?!?), so we can insert code to materialize the // pointer if it is not available. 
- PHITransAddr Address(LI->getPointerOperand(), DL, AC); - Value *LoadPtr = nullptr; - LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, - *DT, NewInsts); + Value *LoadPtr = LI->getPointerOperand(); + BasicBlock *Cur = LI->getParent(); + while (Cur != LoadBB) { + PHITransAddr Address(LoadPtr, DL, AC); + LoadPtr = Address.PHITranslateWithInsertion( + Cur, Cur->getSinglePredecessor(), *DT, NewInsts); + if (!LoadPtr) { + CanDoPRE = false; + break; + } + Cur = Cur->getSinglePredecessor(); + } + if (LoadPtr) { + PHITransAddr Address(LoadPtr, DL, AC); + LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT, + NewInsts); + } // If we couldn't find or insert a computation of this phi translated value, // we fail PRE. if (!LoadPtr) { @@ -1184,8 +1202,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (!CanDoPRE) { while (!NewInsts.empty()) { - Instruction *I = NewInsts.pop_back_val(); - markInstructionForDeletion(I); + // Erase instructions generated by the failed PHI translation before + // trying to number them. PHI translation might insert instructions + // in basic blocks other than the current one, and we delete them + // directly, as markInstructionForDeletion only allows removing from the + // current basic block. + NewInsts.pop_back_val()->eraseFromParent(); } // HINT: Don't revert the edge-splitting as following transformation may // also need to split these critical edges. @@ -1219,10 +1241,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, BasicBlock *UnavailablePred = PredLoad.first; Value *LoadPtr = PredLoad.second; - auto *NewLoad = - new LoadInst(LI->getType(), LoadPtr, LI->getName() + ".pre", - LI->isVolatile(), LI->getAlignment(), LI->getOrdering(), - LI->getSyncScopeID(), UnavailablePred->getTerminator()); + auto *NewLoad = new LoadInst( + LI->getType(), LoadPtr, LI->getName() + ".pre", LI->isVolatile(), + MaybeAlign(LI->getAlignment()), LI->getOrdering(), LI->getSyncScopeID(), + UnavailablePred->getTerminator()); NewLoad->setDebugLoc(LI->getDebugLoc()); // Transfer the old load's AA tags to the new load. @@ -1365,6 +1387,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); } +static bool hasUsersIn(Value *V, BasicBlock *BB) { + for (User *U : V->users()) + if (isa<Instruction>(U) && + cast<Instruction>(U)->getParent() == BB) + return true; + return false; +} + bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume && "This function can only be called with llvm.assume intrinsic"); @@ -1403,12 +1433,23 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { // We can replace assume value with true, which covers cases like this: // call void @llvm.assume(i1 %cmp) // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true - ReplaceWithConstMap[V] = True; - - // If one of *cmp *eq operand is const, adding it to map will cover this: + ReplaceOperandsWithMap[V] = True; + + // If we find an equality fact, canonicalize all dominated uses in this block + // to one of the two values. We heuristically choice the "oldest" of the + // two where age is determined by value number. (Note that propagateEquality + // above handles the cross block case.) 
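The "oldest value" heuristic in the GVN assume-handling hunk above can be summarized as a ranking. A sketch under the assumption that constants beat arguments beat instructions, and that smaller value numbers mean older values (hypothetical helper, not the pass's actual code):

```cpp
#include <cstdint>

// Rank for choosing which side of a proven equality becomes the
// canonical replacement: prefer constants, then arguments, then the
// instruction whose value number is smaller (the "older" value).
enum class ValKind { Constant = 0, Argument = 1, Instruction = 2 };

bool shouldBeReplacement(ValKind KindA, uint32_t VNA,
                         ValKind KindB, uint32_t VNB) {
  if (KindA != KindB)
    return KindA < KindB; // lower-ranked kind wins
  return VNA < VNB;       // among like kinds, older (smaller VN) wins
}
```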
+ // + // Key case to cover are: + // 1) // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen // call void @llvm.assume(i1 %cmp) // ret float %0 ; will change it to ret float 3.000000e+00 + // 2) + // %load = load float, float* %addr + // %cmp = fcmp oeq float %load, %0 + // call void @llvm.assume(i1 %cmp) + // ret float %load ; will change it to ret float %0 if (auto *CmpI = dyn_cast<CmpInst>(V)) { if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ || CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ || @@ -1416,13 +1457,50 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { CmpI->getFastMathFlags().noNaNs())) { Value *CmpLHS = CmpI->getOperand(0); Value *CmpRHS = CmpI->getOperand(1); - if (isa<Constant>(CmpLHS)) + // Heuristically pick the better replacement -- the choice of heuristic + // isn't terribly important here, but the fact we canonicalize on some + // replacement is for exposing other simplifications. + // TODO: pull this out as a helper function and reuse w/existing + // (slightly different) logic. + if (isa<Constant>(CmpLHS) && !isa<Constant>(CmpRHS)) std::swap(CmpLHS, CmpRHS); - auto *RHSConst = dyn_cast<Constant>(CmpRHS); + if (!isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS)) + std::swap(CmpLHS, CmpRHS); + if ((isa<Argument>(CmpLHS) && isa<Argument>(CmpRHS)) || + (isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))) { + // Move the 'oldest' value to the right-hand side, using the value + // number as a proxy for age. + uint32_t LVN = VN.lookupOrAdd(CmpLHS); + uint32_t RVN = VN.lookupOrAdd(CmpRHS); + if (LVN < RVN) + std::swap(CmpLHS, CmpRHS); + } - // If only one operand is constant. - if (RHSConst != nullptr && !isa<Constant>(CmpLHS)) - ReplaceWithConstMap[CmpLHS] = RHSConst; + // Handle degenerate case where we either haven't pruned a dead path or a + // removed a trivial assume yet. + if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS)) + return Changed; + + // +0.0 and -0.0 compare equal, but do not imply equivalence. Unless we + // can prove equivalence, bail. + if (CmpRHS->getType()->isFloatTy() && + (!isa<ConstantFP>(CmpRHS) || cast<ConstantFP>(CmpRHS)->isZero())) + return Changed; + + LLVM_DEBUG(dbgs() << "Replacing dominated uses of " + << *CmpLHS << " with " + << *CmpRHS << " in block " + << IntrinsicI->getParent()->getName() << "\n"); + + + // Setup the replacement map - this handles uses within the same block + if (hasUsersIn(CmpLHS, IntrinsicI->getParent())) + ReplaceOperandsWithMap[CmpLHS] = CmpRHS; + + // NOTE: The non-block local cases are handled by the call to + // propagateEquality above; this block is just about handling the block + // local cases. TODO: There's a bunch of logic in propagateEqualiy which + // isn't duplicated for the block local case, can we share it somehow? } } return Changed; @@ -1522,6 +1600,41 @@ uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, return NewNum; } +// Return true if the value number \p Num and NewNum have equal value. +// Return false if the result is unknown. 
+bool GVN::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum, + const BasicBlock *Pred, + const BasicBlock *PhiBlock, GVN &Gvn) { + CallInst *Call = nullptr; + LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; + while (Vals) { + Call = dyn_cast<CallInst>(Vals->Val); + if (Call && Call->getParent() == PhiBlock) + break; + Vals = Vals->Next; + } + + if (AA->doesNotAccessMemory(Call)) + return true; + + if (!MD || !AA->onlyReadsMemory(Call)) + return false; + + MemDepResult local_dep = MD->getDependency(Call); + if (!local_dep.isNonLocal()) + return false; + + const MemoryDependenceResults::NonLocalDepInfo &deps = + MD->getNonLocalCallDependency(Call); + + // Check to see if the Call has no function local clobber. + for (unsigned i = 0; i < deps.size(); i++) { + if (deps[i].getResult().isNonFuncLocal()) + return true; + } + return false; +} + /// Translate value number \p Num using phis, so that it has the values of /// the phis in BB. uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, @@ -1568,8 +1681,11 @@ uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, } } - if (uint32_t NewNum = expressionNumbering[Exp]) + if (uint32_t NewNum = expressionNumbering[Exp]) { + if (Exp.opcode == Instruction::Call && NewNum != Num) + return areCallValsEqual(Num, NewNum, Pred, PhiBlock, Gvn) ? NewNum : Num; return NewNum; + } return Num; } @@ -1637,16 +1753,12 @@ void GVN::assignBlockRPONumber(Function &F) { InvalidBlockRPONumbers = false; } -// Tries to replace instruction with const, using information from -// ReplaceWithConstMap. -bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { +bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const { bool Changed = false; for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { - Value *Operand = Instr->getOperand(OpNum); - auto it = ReplaceWithConstMap.find(Operand); - if (it != ReplaceWithConstMap.end()) { - assert(!isa<Constant>(Operand) && - "Replacing constants with constants is invalid"); + Value *Operand = Instr->getOperand(OpNum); + auto it = ReplaceOperandsWithMap.find(Operand); + if (it != ReplaceOperandsWithMap.end()) { LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second << " in instruction " << *Instr << '\n'); Instr->setOperand(OpNum, it->second); @@ -1976,6 +2088,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, MD = RunMD; ImplicitControlFlowTracking ImplicitCFT(DT); ICF = &ImplicitCFT; + this->LI = LI; VN.setMemDep(MD); ORE = RunORE; InvalidBlockRPONumbers = true; @@ -2037,13 +2150,13 @@ bool GVN::processBlock(BasicBlock *BB) { return false; // Clearing map before every BB because it can be used only for single BB. - ReplaceWithConstMap.clear(); + ReplaceOperandsWithMap.clear(); bool ChangedFunction = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - if (!ReplaceWithConstMap.empty()) - ChangedFunction |= replaceOperandsWithConsts(&*BI); + if (!ReplaceOperandsWithMap.empty()) + ChangedFunction |= replaceOperandsForInBlockEquality(&*BI); ChangedFunction |= processInstruction(&*BI); if (InstrsToErase.empty()) { @@ -2335,7 +2448,7 @@ bool GVN::performPRE(Function &F) { /// the block inserted to the critical edge. 
BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { BasicBlock *BB = - SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT)); + SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT, LI)); if (MD) MD->invalidateCachedPredecessors(); InvalidBlockRPONumbers = true; @@ -2350,7 +2463,7 @@ bool GVN::splitCriticalEdges() { do { std::pair<Instruction *, unsigned> Edge = toSplit.pop_back_val(); SplitCriticalEdge(Edge.first, Edge.second, - CriticalEdgeSplittingOptions(DT)); + CriticalEdgeSplittingOptions(DT, LI)); } while (!toSplit.empty()); if (MD) MD->invalidateCachedPredecessors(); InvalidBlockRPONumbers = true; @@ -2456,18 +2569,26 @@ void GVN::addDeadBlock(BasicBlock *BB) { if (DeadBlocks.count(B)) continue; + // First, split the critical edges. This might also create additional blocks + // to preserve LoopSimplify form and adjust edges accordingly. SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B)); for (BasicBlock *P : Preds) { if (!DeadBlocks.count(P)) continue; - if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) { + if (llvm::any_of(successors(P), + [B](BasicBlock *Succ) { return Succ == B; }) && + isCriticalEdge(P->getTerminator(), B)) { if (BasicBlock *S = splitCriticalEdges(P, B)) DeadBlocks.insert(P = S); } + } - for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) { - PHINode &Phi = cast<PHINode>(*II); + // Now undef the incoming values from the dead predecessors. + for (BasicBlock *P : predecessors(B)) { + if (!DeadBlocks.count(P)) + continue; + for (PHINode &Phi : B->phis()) { Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType())); if (MD) MD->invalidateCachedPointerInfo(&Phi); @@ -2544,10 +2665,11 @@ public: return Impl.runImpl( F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), getAnalysis<DominatorTreeWrapperPass>().getDomTree(), - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F), getAnalysis<AAResultsWrapperPass>().getAAResults(), - NoMemDepAnalysis ? nullptr - : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(), + NoMemDepAnalysis + ? nullptr + : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(), LIWP ? 
&LIWP->getLoopInfo() : nullptr, &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE()); } @@ -2556,6 +2678,7 @@ public: AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); if (!NoMemDepAnalysis) AU.addRequired<MemoryDependenceWrapperPass>(); AU.addRequired<AAResultsWrapperPass>(); @@ -2563,6 +2686,8 @@ public: AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreservedID(LoopSimplifyID); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp index 7614599653c4..c87e41484b13 100644 --- a/lib/Transforms/Scalar/GVNHoist.cpp +++ b/lib/Transforms/Scalar/GVNHoist.cpp @@ -257,7 +257,7 @@ public: GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA, MemoryDependenceResults *MD, MemorySSA *MSSA) : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA), - MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {} + MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {} bool run(Function &F) { NumFuncArgs = F.arg_size(); @@ -539,7 +539,7 @@ private: // Check for unsafe hoistings due to side effects. if (K == InsKind::Store) { - if (hasEHOrLoadsOnPath(NewPt, dyn_cast<MemoryDef>(U), NBBsOnAllPaths)) + if (hasEHOrLoadsOnPath(NewPt, cast<MemoryDef>(U), NBBsOnAllPaths)) return false; } else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths)) return false; @@ -889,19 +889,18 @@ private: void updateAlignment(Instruction *I, Instruction *Repl) { if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) { - ReplacementLoad->setAlignment( - std::min(ReplacementLoad->getAlignment(), - cast<LoadInst>(I)->getAlignment())); + ReplacementLoad->setAlignment(MaybeAlign(std::min( + ReplacementLoad->getAlignment(), cast<LoadInst>(I)->getAlignment()))); ++NumLoadsRemoved; } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) { ReplacementStore->setAlignment( - std::min(ReplacementStore->getAlignment(), - cast<StoreInst>(I)->getAlignment())); + MaybeAlign(std::min(ReplacementStore->getAlignment(), + cast<StoreInst>(I)->getAlignment()))); ++NumStoresRemoved; } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) { ReplacementAlloca->setAlignment( - std::max(ReplacementAlloca->getAlignment(), - cast<AllocaInst>(I)->getAlignment())); + MaybeAlign(std::max(ReplacementAlloca->getAlignment(), + cast<AllocaInst>(I)->getAlignment()))); } else if (isa<CallInst>(Repl)) { ++NumCallsRemoved; } diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp index e14f44bb7069..2697d7809568 100644 --- a/lib/Transforms/Scalar/GuardWidening.cpp +++ b/lib/Transforms/Scalar/GuardWidening.cpp @@ -591,7 +591,7 @@ bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1, else Result = RC.getCheckInst(); } - + assert(Result && "Failed to find result value"); Result->setName("wide.chk"); } return true; diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index f9fc698a4a9b..5519a00c12c9 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -124,6 +124,11 @@ static cl::opt<bool> DisableLFTR("disable-lftr", cl::Hidden, cl::init(false), cl::desc("Disable Linear Function Test Replace optimization")); +static cl::opt<bool> 
+LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(false), + cl::desc("Predicate conditions in read only loops")); + + namespace { struct RewritePhi; @@ -144,7 +149,11 @@ class IndVarSimplify { bool rewriteNonIntegerIVs(Loop *L); bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); - bool optimizeLoopExits(Loop *L); + /// Try to eliminate loop exits based on analyzeable exit counts + bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter); + /// Try to form loop invariant tests for loop exits by changing how many + /// iterations of the loop run when that is unobservable. + bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter); bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); bool rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); @@ -628,12 +637,30 @@ bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // Okay, this instruction has a user outside of the current loop // and varies predictably *inside* the loop. Evaluate the value it - // contains when the loop exits, if possible. + // contains when the loop exits, if possible. We prefer to start with + // expressions which are true for all exits (so as to maximize + // expression reuse by the SCEVExpander), but resort to per-exit + // evaluation if that fails. const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); - if (!SE->isLoopInvariant(ExitValue, L) || - !isSafeToExpand(ExitValue, *SE)) - continue; - + if (isa<SCEVCouldNotCompute>(ExitValue) || + !SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) { + // TODO: This should probably be sunk into SCEV in some way; maybe a + // getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for + // most SCEV expressions and other recurrence types (e.g. shift + // recurrences). Is there existing code we can reuse? + const SCEV *ExitCount = SE->getExitCount(L, PN->getIncomingBlock(i)); + if (isa<SCEVCouldNotCompute>(ExitCount)) + continue; + if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Inst))) + if (AddRec->getLoop() == L) + ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE); + if (isa<SCEVCouldNotCompute>(ExitValue) || + !SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) + continue; + } + // Computing the value outside of the loop brings no benefit if it is // definitely used inside the loop in a way which can not be optimized // away. Avoid doing so unless we know we have a value which computes @@ -804,7 +831,7 @@ bool IndVarSimplify::canLoopBeDeleted( L->getExitingBlocks(ExitingBlocks); SmallVector<BasicBlock *, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); - if (ExitBlocks.size() > 1 || ExitingBlocks.size() > 1) + if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1) return false; BasicBlock *ExitBlock = ExitBlocks[0]; @@ -1654,6 +1681,10 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { return nullptr; } + // if we reached this point then we are going to replace + // DU.NarrowUse with WideUse. Reattach DbgValue then. + replaceAllDbgUsesWith(*DU.NarrowUse, *WideUse, *WideUse, *DT); + ExtendKindMap[DU.NarrowUse] = WideAddRec.second; // Returning WideUse pushes it on the worklist. return WideUse; @@ -1779,14 +1810,9 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { DeadInsts.emplace_back(DU.NarrowDef); } - // Attach any debug information to the new PHI. Since OrigPhi and WidePHI - // evaluate the same recurrence, we can just copy the debug info over. 
- SmallVector<DbgValueInst *, 1> DbgValues; - llvm::findDbgValues(DbgValues, OrigPhi); - auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(), - ValueAsMetadata::get(WidePhi)); - for (auto &DbgValue : DbgValues) - DbgValue->setOperand(0, MDPhi); + // Attach any debug information to the new PHI. + replaceAllDbgUsesWith(*OrigPhi, *WidePhi, *WidePhi, *DT); + return WidePhi; } @@ -1817,8 +1843,8 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef, auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS)); auto CmpConstrainedLHSRange = ConstantRange::makeAllowedICmpRegion(P, CmpRHSRange); - auto NarrowDefRange = - CmpConstrainedLHSRange.addWithNoSignedWrap(*NarrowDefRHS); + auto NarrowDefRange = CmpConstrainedLHSRange.addWithNoWrap( + *NarrowDefRHS, OverflowingBinaryOperator::NoSignedWrap); updatePostIncRangeInfo(NarrowDef, NarrowUser, NarrowDefRange); }; @@ -2242,8 +2268,8 @@ static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB, if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy()) continue; - const auto *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); - + const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); + // AR may be a pointer type, while BECount is an integer type. // AR may be wider than BECount. With eq/ne tests overflow is immaterial. // AR may not be a narrower type, or we may never exit. @@ -2624,74 +2650,125 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { return MadeAnyChanges; } -bool IndVarSimplify::optimizeLoopExits(Loop *L) { +/// Return a symbolic upper bound for the backedge taken count of the loop. +/// This is more general than getConstantMaxBackedgeTakenCount as it returns +/// an arbitrary expression as opposed to only constants. +/// TODO: Move into the ScalarEvolution class. +static const SCEV* getMaxBackedgeTakenCount(ScalarEvolution &SE, + DominatorTree &DT, Loop *L) { SmallVector<BasicBlock*, 16> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); // Form an expression for the maximum exit count possible for this loop. We // merge the max and exact information to approximate a version of - // getMaxBackedgeTakenInfo which isn't restricted to just constants. - // TODO: factor this out as a version of getMaxBackedgeTakenCount which - // isn't guaranteed to return a constant. + // getConstantMaxBackedgeTakenCount which isn't restricted to just constants. SmallVector<const SCEV*, 4> ExitCounts; - const SCEV *MaxConstEC = SE->getMaxBackedgeTakenCount(L); + const SCEV *MaxConstEC = SE.getConstantMaxBackedgeTakenCount(L); if (!isa<SCEVCouldNotCompute>(MaxConstEC)) ExitCounts.push_back(MaxConstEC); for (BasicBlock *ExitingBB : ExitingBlocks) { - const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + const SCEV *ExitCount = SE.getExitCount(L, ExitingBB); if (!isa<SCEVCouldNotCompute>(ExitCount)) { - assert(DT->dominates(ExitingBB, L->getLoopLatch()) && + assert(DT.dominates(ExitingBB, L->getLoopLatch()) && "We should only have known counts for exiting blocks that " "dominate latch!"); ExitCounts.push_back(ExitCount); } } if (ExitCounts.empty()) - return false; - const SCEV *MaxExitCount = SE->getUMinFromMismatchedTypes(ExitCounts); + return SE.getCouldNotCompute(); + return SE.getUMinFromMismatchedTypes(ExitCounts); +} - bool Changed = false; - for (BasicBlock *ExitingBB : ExitingBlocks) { +bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { + SmallVector<BasicBlock*, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + // Remove all exits which aren't both rewriteable and analyzeable. 
+  auto NewEnd = llvm::remove_if(ExitingBlocks,
+                                [&](BasicBlock *ExitingBB) {
     // If our exiting block exits multiple loops, we can only rewrite the
     // innermost one.  Otherwise, we're changing how many times the innermost
     // loop runs before it exits.
     if (LI->getLoopFor(ExitingBB) != L)
-      continue;
+      return true;
 
     // Can't rewrite non-branch yet.
     BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
     if (!BI)
-      continue;
+      return true;
 
     // If already constant, nothing to do.
     if (isa<Constant>(BI->getCondition()))
-      continue;
+      return true;
 
     const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
     if (isa<SCEVCouldNotCompute>(ExitCount))
-      continue;
+      return true;
+    return false;
+  });
+  ExitingBlocks.erase(NewEnd, ExitingBlocks.end());
+
+  if (ExitingBlocks.empty())
+    return false;
+
+  // Get a symbolic upper bound on the loop backedge taken count.
+  const SCEV *MaxExitCount = getMaxBackedgeTakenCount(*SE, *DT, L);
+  if (isa<SCEVCouldNotCompute>(MaxExitCount))
+    return false;
+
+  // Visit our exit blocks in order of dominance.  We know from the fact that
+  // all remaining exits are analyzable that there must be a total dominance
+  // order between them as each must dominate the latch.  The visit order only
+  // matters for the provably equal case.
+  llvm::sort(ExitingBlocks,
+             [&](BasicBlock *A, BasicBlock *B) {
+               // std::sort sorts in ascending order, so we want the inverse of
+               // the normal dominance relation.
+               if (DT->properlyDominates(A, B)) return true;
+               if (DT->properlyDominates(B, A)) return false;
+               llvm_unreachable("expected total dominance order!");
+             });
+#ifndef NDEBUG
+  for (unsigned i = 1; i < ExitingBlocks.size(); i++) {
+    assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]));
+  }
+#endif
+
+  auto FoldExit = [&](BasicBlock *ExitingBB, bool IsTaken) {
+    BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+    bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
+    auto *OldCond = BI->getCondition();
+    auto *NewCond = ConstantInt::get(OldCond->getType(),
+                                     IsTaken ? ExitIfTrue : !ExitIfTrue);
+    BI->setCondition(NewCond);
+    if (OldCond->use_empty())
+      DeadInsts.push_back(OldCond);
+  };
+  bool Changed = false;
+  SmallSet<const SCEV*, 8> DominatingExitCounts;
+  for (BasicBlock *ExitingBB : ExitingBlocks) {
+    const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+    assert(!isa<SCEVCouldNotCompute>(ExitCount) && "checked above");
+
     // If we know we'd exit on the first iteration, rewrite the exit to
     // reflect this.  This does not imply the loop must exit through this
     // exit; there may be an earlier one taken on the first iteration.
     // TODO: Given we know the backedge can't be taken, we should go ahead
     // and break it.  Or at least, kill all the header phis and simplify.
     if (ExitCount->isZero()) {
-      bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
-      auto *OldCond = BI->getCondition();
-      auto *NewCond = ExitIfTrue ? ConstantInt::getTrue(OldCond->getType()) :
-        ConstantInt::getFalse(OldCond->getType());
-      BI->setCondition(NewCond);
-      if (OldCond->use_empty())
-        DeadInsts.push_back(OldCond);
+      FoldExit(ExitingBB, true);
       Changed = true;
       continue;
     }
 
-    // If we end up with a pointer exit count, bail.
+    // If we end up with a pointer exit count, bail.  Note that we can end up
+    // with a pointer exit count for one exiting block, and not for another in
+    // the same loop.
     if (!ExitCount->getType()->isIntegerTy() ||
         !MaxExitCount->getType()->isIntegerTy())
-      return false;
+      continue;
 
     Type *WiderType =
       SE->getWiderType(MaxExitCount->getType(), ExitCount->getType());
@@ -2700,35 +2777,198 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L) {
     assert(MaxExitCount->getType() == ExitCount->getType());
 
     // Can we prove that some other exit must be taken strictly before this
-    // one?  TODO: handle cases where ule is known, and equality is covered
-    // by a dominating exit
+    // one?
     if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT,
                                      MaxExitCount, ExitCount)) {
-      bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
-      auto *OldCond = BI->getCondition();
-      auto *NewCond = ExitIfTrue ? ConstantInt::getFalse(OldCond->getType()) :
-        ConstantInt::getTrue(OldCond->getType());
-      BI->setCondition(NewCond);
-      if (OldCond->use_empty())
-        DeadInsts.push_back(OldCond);
+      FoldExit(ExitingBB, false);
       Changed = true;
       continue;
     }
 
-    // TODO: If we can prove that the exiting iteration is equal to the exit
-    // count for this exit and that no previous exit oppurtunities exist within
-    // the loop, then we can discharge all other exits.  (May fall out of
-    // previous TODO.)
-
-    // TODO: If we can't prove any relation between our exit count and the
-    // loops exit count, but taking this exit doesn't require actually running
-    // the loop (i.e. no side effects, no computed values used in exit), then
-    // we can replace the exit test with a loop invariant test which exits on
-    // the first iteration.
+    // As we run, keep track of which exit counts we've encountered.  If we
+    // find a duplicate, we've found an exit which would have exited on the
+    // exiting iteration, but (from the visit order) strictly follows another
+    // which does the same and is thus dead.
+    if (!DominatingExitCounts.insert(ExitCount).second) {
+      FoldExit(ExitingBB, false);
+      Changed = true;
+      continue;
+    }
+
+    // TODO: There might be another opportunity to leverage SCEV's reasoning
+    // here.  If we kept track of the min of dominating exits so far, we could
+    // discharge exits with EC >= MDEC.  This is less powerful than the
+    // existing transform (since later exits aren't considered), but
+    // potentially more powerful for any case where SCEV can prove a >=u b,
+    // but neither a == b nor a >u b.  Such a case is not currently known.
   }
   return Changed;
 }
 
+bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
+  SmallVector<BasicBlock*, 16> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  bool Changed = false;
+
+  // Finally, see if we can rewrite our exit conditions into a loop invariant
+  // form.  If we have a read-only loop, and we can tell that we must exit
+  // down a path which does not need any of the values computed within the
+  // loop, we can rewrite the loop to exit on the first iteration.  Note that
+  // this doesn't a) tell us the loop exits on the first iteration (unless
+  // *all* exits are predicatable) or b) tell us *which* exit might be taken.
+  // This transformation looks a lot like a form of dead loop elimination, but
+  // restricted to read-only loops and without necessarily needing to kill
+  // the loop entirely.
+  if (!LoopPredication)
+    return Changed;
+
+  if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+    return Changed;
+
+  // Note: ExactBTC is the exact backedge taken count *iff* the loop exits
+  // through *explicit* control flow.  We have to eliminate the possibility of
+  // implicit exits (see below) before we know it's truly exact.
+  const SCEV *ExactBTC = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(ExactBTC) ||
+      !SE->isLoopInvariant(ExactBTC, L) ||
+      !isSafeToExpand(ExactBTC, *SE))
+    return Changed;
+
+  auto BadExit = [&](BasicBlock *ExitingBB) {
+    // If our exiting block exits multiple loops, we can only rewrite the
+    // innermost one.  Otherwise, we're changing how many times the innermost
+    // loop runs before it exits.
+    if (LI->getLoopFor(ExitingBB) != L)
+      return true;
+
+    // Can't rewrite non-branch yet.
+    BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+    if (!BI)
+      return true;
+
+    // If already constant, nothing to do.
+    if (isa<Constant>(BI->getCondition()))
+      return true;
+
+    // If the exit block has phis, we need to be able to compute the values
+    // within the loop which contains them.  This assumes trivially LCSSA phis
+    // have already been removed; TODO: generalize.
+    BasicBlock *ExitBlock =
+      BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0);
+    if (!ExitBlock->phis().empty())
+      return true;
+
+    const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+    assert(!isa<SCEVCouldNotCompute>(ExactBTC) &&
+           "implied by having exact trip count");
+    if (!SE->isLoopInvariant(ExitCount, L) ||
+        !isSafeToExpand(ExitCount, *SE))
+      return true;
+
+    return false;
+  };
+
+  // If we have any exits which can't be predicated themselves, then we can't
+  // predicate any exit which isn't guaranteed to execute before it.  Consider
+  // two exits (a) and (b) which would both exit on the same iteration.  If we
+  // can predicate (b), but not (a), and (a) precedes (b) along some path,
+  // then we could convert a loop from exiting through (a) to one exiting
+  // through (b).  Note that this problem exists only for exits with the same
+  // exit count, and we could be more aggressive when exit counts are known
+  // unequal.
+  llvm::sort(ExitingBlocks,
+             [&](BasicBlock *A, BasicBlock *B) {
+               // std::sort sorts in ascending order, so we want the inverse of
+               // the normal dominance relation, plus a tie breaker for blocks
+               // unordered by dominance.
+               if (DT->properlyDominates(A, B)) return true;
+               if (DT->properlyDominates(B, A)) return false;
+               return A->getName() < B->getName();
+             });
+  // Check to see if our exit blocks are a total order (i.e. a linear chain of
+  // exits before the backedge).  If they aren't, reasoning about reachability
+  // is complicated and we choose not to for now.
+  for (unsigned i = 1; i < ExitingBlocks.size(); i++)
+    if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]))
+      return Changed;
+
+  // Given our sorted total order, we know that exit[j] must be evaluated
+  // after all exit[i] such that j > i.
+  for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++)
+    if (BadExit(ExitingBlocks[i])) {
+      ExitingBlocks.resize(i);
+      break;
+    }
+
+  if (ExitingBlocks.empty())
+    return Changed;
+
+  // We rely on not being able to reach an exiting block on a later iteration
+  // than its statically computed exit count.  The implementation of
+  // getExitCount currently has this invariant, but assert it here so that
+  // breakage is obvious if this ever changes.
+  assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) {
+           return DT->dominates(ExitingBB, L->getLoopLatch());
+         }));
+
+  // At this point, ExitingBlocks consists of only those blocks which are
+  // predicatable.  Given that, we know we have at least one exit we can
+  // predicate if the loop doesn't have side effects and doesn't have any
+  // implicit exits (because then our exact BTC isn't actually exact).
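An easy-to-miss detail in the setup above: because ExitingBlocks is sorted by dominance, the first exit that fails BadExit invalidates every exit after it, not just itself, so the vector is truncated to the passing prefix rather than filtered element-wise. A minimal standalone C++ analogue of that step, with plain ints standing in for exiting blocks (the values and the "0 means bad exit" convention are illustrative only, not LLVM API):

    #include <cstdio>
    #include <vector>

    int main() {
      // Exits in dominance order; 0 marks an exit that cannot be predicated.
      std::vector<int> Exits = {3, 7, 0, 5};
      // Keep only the prefix of predicatable exits: once one exit fails, no
      // later exit is guaranteed to execute before it, so it must be dropped
      // too, even if it would pass the check on its own.
      for (unsigned i = 0, e = Exits.size(); i < e; i++)
        if (Exits[i] == 0) {
          Exits.resize(i);
          break;
        }
      std::printf("%zu exit(s) remain predicatable\n", Exits.size());
    }

Running this prints "2 exit(s) remain predicatable": the exit after the bad one is discarded even though it looks fine in isolation.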
+  // @Reviewers - As structured, this is O(I^2) for loop nests.  Any
+  // suggestions on how to improve this?  I can obviously bail out for outer
+  // loops, but that seems less than ideal.  MemorySSA can find memory writes,
+  // is that enough for *all* side effects?
+  for (BasicBlock *BB : L->blocks())
+    for (auto &I : *BB)
+      // TODO: isGuaranteedToTransfer
+      if (I.mayHaveSideEffects() || I.mayThrow())
+        return Changed;
+
+  // Finally, do the actual predication for all predicatable blocks.  A couple
+  // of notes here:
+  // 1) We don't bother to constant fold dominated exits with identical exit
+  //    counts; that's simply a form of CSE/equality propagation and we leave
+  //    it for dedicated passes.
+  // 2) We insert the comparison at the branch.  Hoisting introduces additional
+  //    legality constraints and we leave that to dedicated logic.  We want to
+  //    predicate even if we can't insert a loop invariant expression as
+  //    peeling or unrolling will likely reduce the cost of the otherwise loop
+  //    varying check.
+  Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator());
+  IRBuilder<> B(L->getLoopPreheader()->getTerminator());
+  Value *ExactBTCV = nullptr; // lazily generated if needed
+  for (BasicBlock *ExitingBB : ExitingBlocks) {
+    const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+
+    auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+    Value *NewCond;
+    if (ExitCount == ExactBTC) {
+      NewCond = L->contains(BI->getSuccessor(0)) ?
+        B.getFalse() : B.getTrue();
+    } else {
+      Value *ECV = Rewriter.expandCodeFor(ExitCount);
+      if (!ExactBTCV)
+        ExactBTCV = Rewriter.expandCodeFor(ExactBTC);
+      Value *RHS = ExactBTCV;
+      if (ECV->getType() != RHS->getType()) {
+        Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType());
+        ECV = B.CreateZExt(ECV, WiderTy);
+        RHS = B.CreateZExt(RHS, WiderTy);
+      }
+      auto Pred = L->contains(BI->getSuccessor(0)) ?
+        ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
+      NewCond = B.CreateICmp(Pred, ECV, RHS);
+    }
+    Value *OldCond = BI->getCondition();
+    BI->setCondition(NewCond);
+    if (OldCond->use_empty())
+      DeadInsts.push_back(OldCond);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
 //===----------------------------------------------------------------------===//
 //  IndVarSimplify driver. Manage several subpasses of IV simplification.
 //===----------------------------------------------------------------------===//
@@ -2755,7 +2995,10 @@ bool IndVarSimplify::run(Loop *L) {
   // transform them to use integer recurrences.
   Changed |= rewriteNonIntegerIVs(L);
 
+#ifndef NDEBUG
+  // Used below for a consistency check only.
   const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+#endif
 
   // Create a rewriter object which we'll use to transform the code with.
   SCEVExpander Rewriter(*SE, DL, "indvars");
@@ -2772,20 +3015,22 @@ bool IndVarSimplify::run(Loop *L) {
   Rewriter.disableCanonicalMode();
   Changed |= simplifyAndExtend(L, Rewriter, LI);
 
-  // Check to see if this loop has a computable loop-invariant execution count.
-  // If so, this means that we can compute the final value of any expressions
+  // Check to see if we can compute the final value of any expressions
   // that are recurrent in the loop, and substitute the exit values from the
-  // loop into any instructions outside of the loop that use the final values of
-  // the current expressions.
-  //
-  if (ReplaceExitValue != NeverRepl &&
-      !isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+  // loop into any instructions outside of the loop that use the final values
+  // of the current expressions.
+ if (ReplaceExitValue != NeverRepl) Changed |= rewriteLoopExitValues(L, Rewriter); // Eliminate redundant IV cycles. NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); - Changed |= optimizeLoopExits(L); + // Try to eliminate loop exits based on analyzeable exit counts + Changed |= optimizeLoopExits(L, Rewriter); + + // Try to form loop invariant tests for loop exits by changing how many + // iterations of the loop run when that is unobservable. + Changed |= predicateLoopExits(L, Rewriter); // If we have a trip count expression, rewrite the loop's exit condition // using it. @@ -2825,7 +3070,7 @@ bool IndVarSimplify::run(Loop *L) { // that our definition of "high cost" is not exactly principled. if (Rewriter.isHighCostExpansion(ExitCount, L)) continue; - + // Check preconditions for proper SCEVExpander operation. SCEV does not // express SCEVExpander's dependencies, such as LoopSimplify. Instead // any pass that uses the SCEVExpander must do it. This does not work @@ -2924,7 +3169,7 @@ struct IndVarSimplifyLegacyPass : public LoopPass { auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr; auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>(); auto *TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr; const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); diff --git a/lib/Transforms/Scalar/InferAddressSpaces.cpp b/lib/Transforms/Scalar/InferAddressSpaces.cpp index 5f0e2001c73d..e7e73a132fbe 100644 --- a/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -141,6 +141,8 @@ using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>; /// InferAddressSpaces class InferAddressSpaces : public FunctionPass { + const TargetTransformInfo *TTI; + /// Target specific address space which uses of should be replaced if /// possible. unsigned FlatAddrSpace; @@ -264,17 +266,6 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II, Module *M = II->getParent()->getParent()->getParent(); switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_ds_fadd: - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: { - const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4)); - if (!IsVolatile || !IsVolatile->isZero()) - return false; - - LLVM_FALLTHROUGH; - } case Intrinsic::objectsize: { Type *DestTy = II->getType(); Type *SrcTy = NewV->getType(); @@ -285,25 +276,27 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II, return true; } default: - return false; + return TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV); } } -// TODO: Move logic to TTI? 
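The removed "TODO: Move logic to TTI?" above is exactly what this change carries out: the pass no longer hard-codes AMDGPU intrinsics and instead asks TargetTransformInfo (via collectFlatAddressOperands and rewriteIntrinsicWithAddressSpace) which intrinsic operands are flat pointers. A standalone C++ analogue of that inversion of responsibility, with a plain virtual interface standing in for TTI; the class names, intrinsic ID, and operand index below are hypothetical:

    #include <cstdio>
    #include <vector>

    // Stand-in for the target interface: the pass stays target-agnostic and
    // queries the target for rewritable pointer-operand indexes.
    struct TargetInfo {
      virtual bool collectPtrOperands(std::vector<int> &OpIndexes,
                                      int IntrinsicID) const {
        return false; // Default: no target-specific intrinsics to rewrite.
      }
      virtual ~TargetInfo() = default;
    };

    struct MyTarget : TargetInfo {
      bool collectPtrOperands(std::vector<int> &OpIndexes,
                              int IntrinsicID) const override {
        if (IntrinsicID == 42) {  // hypothetical target intrinsic
          OpIndexes.push_back(0); // its pointer operand is operand 0
          return true;
        }
        return false;
      }
    };

    int main() {
      MyTarget TTI;
      std::vector<int> Idx;
      if (TTI.collectPtrOperands(Idx, 42))
        std::printf("would rewrite %zu operand(s)\n", Idx.size());
    }

The design payoff is visible in the diff: the generic pass shrinks, and new targets opt in by overriding the hook rather than by patching the pass.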
void InferAddressSpaces::collectRewritableIntrinsicOperands( IntrinsicInst *II, std::vector<std::pair<Value *, bool>> &PostorderStack, DenseSet<Value *> &Visited) const { - switch (II->getIntrinsicID()) { + auto IID = II->getIntrinsicID(); + switch (IID) { case Intrinsic::objectsize: - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_ds_fadd: - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0), PostorderStack, Visited); break; default: + SmallVector<int, 2> OpIndexes; + if (TTI->collectFlatAddressOperands(OpIndexes, IID)) { + for (int Idx : OpIndexes) { + appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(Idx), + PostorderStack, Visited); + } + } break; } } @@ -631,11 +624,10 @@ bool InferAddressSpaces::runOnFunction(Function &F) { if (skipFunction(F)) return false; - const TargetTransformInfo &TTI = - getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); if (FlatAddrSpace == UninitializedAddressSpace) { - FlatAddrSpace = TTI.getFlatAddressSpace(); + FlatAddrSpace = TTI->getFlatAddressSpace(); if (FlatAddrSpace == UninitializedAddressSpace) return false; } @@ -650,7 +642,7 @@ bool InferAddressSpaces::runOnFunction(Function &F) { // Changes the address spaces of the flat address expressions who are inferred // to point to a specific address space. - return rewriteWithNewAddressSpaces(TTI, Postorder, InferredAddrSpace, &F); + return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, &F); } // Constants need to be tracked through RAUW to handle cases with nested diff --git a/lib/Transforms/Scalar/InstSimplifyPass.cpp b/lib/Transforms/Scalar/InstSimplifyPass.cpp index 6616364ab203..ec28f790f252 100644 --- a/lib/Transforms/Scalar/InstSimplifyPass.cpp +++ b/lib/Transforms/Scalar/InstSimplifyPass.cpp @@ -33,37 +33,39 @@ static bool runImpl(Function &F, const SimplifyQuery &SQ, bool Changed = false; do { - for (BasicBlock *BB : depth_first(&F.getEntryBlock())) { - // Here be subtlety: the iterator must be incremented before the loop - // body (not sure why), so a range-for loop won't work here. - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = &*BI++; - // The first time through the loop ToSimplify is empty and we try to - // simplify all instructions. On later iterations ToSimplify is not + for (BasicBlock &BB : F) { + // Unreachable code can take on strange forms that we are not prepared to + // handle. For example, an instruction may have itself as an operand. + if (!SQ.DT->isReachableFromEntry(&BB)) + continue; + + SmallVector<Instruction *, 8> DeadInstsInBB; + for (Instruction &I : BB) { + // The first time through the loop, ToSimplify is empty and we try to + // simplify all instructions. On later iterations, ToSimplify is not // empty and we only bother simplifying instructions that are in it. - if (!ToSimplify->empty() && !ToSimplify->count(I)) + if (!ToSimplify->empty() && !ToSimplify->count(&I)) continue; - // Don't waste time simplifying unused instructions. - if (!I->use_empty()) { - if (Value *V = SimplifyInstruction(I, SQ, ORE)) { + // Don't waste time simplifying dead/unused instructions. 
+ if (isInstructionTriviallyDead(&I)) { + DeadInstsInBB.push_back(&I); + Changed = true; + } else if (!I.use_empty()) { + if (Value *V = SimplifyInstruction(&I, SQ, ORE)) { // Mark all uses for resimplification next time round the loop. - for (User *U : I->users()) + for (User *U : I.users()) Next->insert(cast<Instruction>(U)); - I->replaceAllUsesWith(V); + I.replaceAllUsesWith(V); ++NumSimplified; Changed = true; + // A call can get simplified, but it may not be trivially dead. + if (isInstructionTriviallyDead(&I)) + DeadInstsInBB.push_back(&I); } } - if (RecursivelyDeleteTriviallyDeadInstructions(I, SQ.TLI)) { - // RecursivelyDeleteTriviallyDeadInstruction can remove more than one - // instruction, so simply incrementing the iterator does not work. - // When instructions get deleted re-iterate instead. - BI = BB->begin(); - BE = BB->end(); - Changed = true; - } } + RecursivelyDeleteTriviallyDeadInstructions(DeadInstsInBB, SQ.TLI); } // Place the list of instructions to simplify on the next loop iteration @@ -90,7 +92,7 @@ struct InstSimplifyLegacyPass : public FunctionPass { AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } - /// runOnFunction - Remove instructions that simplify. + /// Remove instructions that simplify. bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; @@ -98,7 +100,7 @@ struct InstSimplifyLegacyPass : public FunctionPass { const DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); AssumptionCache *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); OptimizationRemarkEmitter *ORE = diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index b86bf2fefbe5..0cf00baaa24a 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -224,13 +224,21 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> { auto *PredBB = IncomingBB; auto *SuccBB = PhiBB; + SmallPtrSet<BasicBlock *, 16> Visited; while (true) { BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()); if (PredBr && PredBr->isConditional()) return {PredBB, SuccBB}; + Visited.insert(PredBB); auto *SinglePredBB = PredBB->getSinglePredecessor(); if (!SinglePredBB) return {nullptr, nullptr}; + + // Stop searching when SinglePredBB has been visited. It means we see + // an unreachable loop. + if (Visited.count(SinglePredBB)) + return {nullptr, nullptr}; + SuccBB = PredBB; PredBB = SinglePredBB; } @@ -253,7 +261,9 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { return; BasicBlock *PredBB = PredOutEdge.first; - BranchInst *PredBr = cast<BranchInst>(PredBB->getTerminator()); + BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()); + if (!PredBr) + return; uint64_t PredTrueWeight, PredFalseWeight; // FIXME: We currently only set the profile data when it is missing. @@ -286,7 +296,7 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { bool JumpThreading::runOnFunction(Function &F) { if (skipFunction(F)) return false; - auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); // Get DT analysis before LVI. When LVI is initialized it conditionally adds // DT if it's available. 
auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -1461,7 +1471,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { "Can't handle critical edge here!"); LoadInst *NewVal = new LoadInst( LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred), - LoadI->getName() + ".pr", false, LoadI->getAlignment(), + LoadI->getName() + ".pr", false, MaybeAlign(LoadI->getAlignment()), LoadI->getOrdering(), LoadI->getSyncScopeID(), UnavailablePred->getTerminator()); NewVal->setDebugLoc(LoadI->getDebugLoc()); @@ -2423,7 +2433,7 @@ void JumpThreadingPass::UnfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB, // |----- // v // BB - BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator()); + BranchInst *PredTerm = cast<BranchInst>(Pred->getTerminator()); BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold", BB->getParent(), BB); // Move the unconditional branch to NewBB. diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index d9dda4cef2d2..6ce4831a7359 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -220,7 +220,8 @@ struct LegacyLICMPass : public LoopPass { &getAnalysis<AAResultsWrapperPass>().getAAResults(), &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()), &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( *L->getHeader()->getParent()), SE ? &SE->getSE() : nullptr, MSSA, &ORE, false); @@ -294,7 +295,7 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, PA.preserve<DominatorTreeAnalysis>(); PA.preserve<LoopAnalysis>(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; @@ -330,6 +331,12 @@ bool LoopInvariantCodeMotion::runOnLoop( assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); + // If this loop has metadata indicating that LICM is not to be performed then + // just exit. + if (hasDisableLICMTransformsHint(L)) { + return false; + } + std::unique_ptr<AliasSetTracker> CurAST; std::unique_ptr<MemorySSAUpdater> MSSAU; bool NoOfMemAccTooLarge = false; @@ -340,7 +347,7 @@ bool LoopInvariantCodeMotion::runOnLoop( CurAST = collectAliasInfoForLoop(L, LI, AA); } else { LLVM_DEBUG(dbgs() << "LICM: Using MemorySSA.\n"); - MSSAU = make_unique<MemorySSAUpdater>(MSSA); + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); unsigned AccessCapCount = 0; for (auto *BB : L->getBlocks()) { @@ -956,7 +963,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // Now that we've finished hoisting make sure that LI and DT are still // valid. -#ifndef NDEBUG +#ifdef EXPENSIVE_CHECKS if (Changed) { assert(DT->verify(DominatorTree::VerificationLevel::Fast) && "Dominator tree verification failed"); @@ -1026,7 +1033,8 @@ namespace { bool isHoistableAndSinkableInst(Instruction &I) { // Only these instructions are hoistable/sinkable. 
   return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) ||
-          isa<FenceInst>(I) || isa<BinaryOperator>(I) || isa<CastInst>(I) ||
+          isa<FenceInst>(I) || isa<CastInst>(I) ||
+          isa<UnaryOperator>(I) || isa<BinaryOperator>(I) ||
           isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
           isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
@@ -1092,7 +1100,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
     // in the same alias set as something that ends up being modified.
     if (AA->pointsToConstantMemory(LI->getOperand(0)))
       return true;
-    if (LI->getMetadata(LLVMContext::MD_invariant_load))
+    if (LI->hasMetadata(LLVMContext::MD_invariant_load))
       return true;
 
     if (LI->isAtomic() && !TargetExecutesOncePerLoop)
@@ -1240,12 +1248,22 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
         // FIXME: More precise: no Uses that alias SI.
         if (!Flags->IsSink && !MSSA->dominates(SIMD, MU))
           return false;
-      } else if (const auto *MD = dyn_cast<MemoryDef>(&MA))
+      } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) {
         if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) {
           (void)LI; // Silence warning.
           assert(!LI->isUnordered() && "Expected ordered load");
           return false;
         }
+        // Any call, while it may not be clobbering SI, may be a use.
+        if (auto *CI = dyn_cast<CallInst>(MD->getMemoryInst())) {
+          // Check if the call may read from the memory location written
+          // to by SI.  Check CI's attributes and arguments; the number of
+          // such checks performed is limited above by NoOfMemAccTooLarge.
+          ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI));
+          if (isModOrRefSet(MRI))
+            return false;
+        }
+      }
     }
 
     auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI);
@@ -1375,8 +1393,7 @@ static Instruction *CloneInstructionInExitBlock(
   if (!I.getName().empty())
     New->setName(I.getName() + ".le");
 
-  MemoryAccess *OldMemAcc;
-  if (MSSAU && (OldMemAcc = MSSAU->getMemorySSA()->getMemoryAccess(&I))) {
+  if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
     // Create a new MemoryAccess and let MemorySSA set its defining access.
     MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
         New, nullptr, New->getParent(), MemorySSA::Beginning);
@@ -1385,7 +1402,7 @@ static Instruction *CloneInstructionInExitBlock(
       MSSAU->insertDef(MemDef, /*RenameUses=*/true);
     else {
       auto *MemUse = cast<MemoryUse>(NewMemAcc);
-      MSSAU->insertUse(MemUse);
+      MSSAU->insertUse(MemUse, /*RenameUses=*/true);
     }
   }
 }
@@ -1783,7 +1800,7 @@ public:
       StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
       if (UnorderedAtomic)
         NewSI->setOrdering(AtomicOrdering::Unordered);
-      NewSI->setAlignment(Alignment);
+      NewSI->setAlignment(MaybeAlign(Alignment));
       NewSI->setDebugLoc(DL);
       if (AATags)
         NewSI->setAAMetadata(AATags);
@@ -2016,7 +2033,8 @@ bool llvm::promoteLoopAccessesToScalars(
       if (!DereferenceableInPH) {
         DereferenceableInPH = isDereferenceableAndAlignedPointer(
             Store->getPointerOperand(), Store->getValueOperand()->getType(),
-            Store->getAlignment(), MDL, Preheader->getTerminator(), DT);
+            MaybeAlign(Store->getAlignment()), MDL,
+            Preheader->getTerminator(), DT);
       }
     } else
       return false; // Not a load or store.
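The new MemoryDef handling above is conservative in a subtle way: a call that merely reads the stored-to location already blocks the transform, because moving the store past the call would change the value the call observes; hence the isModOrRefSet test rather than a mod-only test. A standalone sketch of that predicate, with plain C++ stand-ins for llvm::ModRefInfo and the alias query (illustrative, not LLVM API):

    #include <cstdio>

    enum class ModRefInfo { NoModRef, Ref, Mod, ModRef };

    bool isModOrRefSet(ModRefInfo MRI) { return MRI != ModRefInfo::NoModRef; }

    // Pretend alias-analysis query: does the call read and/or write the
    // store's memory location?
    ModRefInfo getModRefInfo(bool CallReadsLoc, bool CallWritesLoc) {
      if (CallReadsLoc && CallWritesLoc)
        return ModRefInfo::ModRef;
      if (CallWritesLoc)
        return ModRefInfo::Mod;
      if (CallReadsLoc)
        return ModRefInfo::Ref;
      return ModRefInfo::NoModRef;
    }

    int main() {
      // A read-only call is enough to block the store: Ref counts as
      // mod-or-ref, so the early "return false" above would be taken.
      bool Blocked =
          isModOrRefSet(getModRefInfo(/*reads=*/true, /*writes=*/false));
      std::printf("sinking blocked: %s\n", Blocked ? "yes" : "no");
    }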
@@ -2101,20 +2119,21 @@ bool llvm::promoteLoopAccessesToScalars( SomePtr->getName() + ".promoted", Preheader->getTerminator()); if (SawUnorderedAtomic) PreheaderLoad->setOrdering(AtomicOrdering::Unordered); - PreheaderLoad->setAlignment(Alignment); + PreheaderLoad->setAlignment(MaybeAlign(Alignment)); PreheaderLoad->setDebugLoc(DL); if (AATags) PreheaderLoad->setAAMetadata(AATags); SSA.AddAvailableValue(Preheader, PreheaderLoad); - MemoryAccess *PreheaderLoadMemoryAccess; if (MSSAU) { - PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( + MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End); MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess); - MSSAU->insertUse(NewMemUse); + MSSAU->insertUse(NewMemUse, /*RenameUses=*/true); } + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); // Rewrite all the loads in the loop and remember all the definitions from // stores in the loop. Promoter.run(LoopUses); @@ -2161,7 +2180,7 @@ LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI, LoopToAliasSetMap.erase(MapI); } if (!CurAST) - CurAST = make_unique<AliasSetTracker>(*AA); + CurAST = std::make_unique<AliasSetTracker>(*AA); // Add everything from the sub loops that are no longer directly available. for (Loop *InnerL : RecomputeLoops) @@ -2180,7 +2199,7 @@ std::unique_ptr<AliasSetTracker> LoopInvariantCodeMotion::collectAliasInfoForLoopWithMSSA( Loop *L, AliasAnalysis *AA, MemorySSAUpdater *MSSAU) { auto *MSSA = MSSAU->getMemorySSA(); - auto CurAST = make_unique<AliasSetTracker>(*AA, MSSA, L); + auto CurAST = std::make_unique<AliasSetTracker>(*AA, MSSA, L); CurAST->addAllInstructionsInLoopUsingMSSA(); return CurAST; } diff --git a/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 1fcf1315a177..a972d6fa2fcd 100644 --- a/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -312,8 +312,8 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { IRBuilder<> Builder(MemI); Module *M = BB->getParent()->getParent(); Type *I32 = Type::getInt32Ty(BB->getContext()); - Function *PrefetchFunc = - Intrinsic::getDeclaration(M, Intrinsic::prefetch); + Function *PrefetchFunc = Intrinsic::getDeclaration( + M, Intrinsic::prefetch, PrefPtrValue->getType()); Builder.CreateCall( PrefetchFunc, {PrefPtrValue, diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 8371367e24e7..cee197cf8354 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -191,7 +191,7 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. - const SCEV *S = SE.getMaxBackedgeTakenCount(L); + const SCEV *S = SE.getConstantMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) { LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n"); return Changed ? 
LoopDeletionResult::Modified diff --git a/lib/Transforms/Scalar/LoopFuse.cpp b/lib/Transforms/Scalar/LoopFuse.cpp index 0bc2bcff2ae1..9f93c68e6128 100644 --- a/lib/Transforms/Scalar/LoopFuse.cpp +++ b/lib/Transforms/Scalar/LoopFuse.cpp @@ -66,7 +66,7 @@ using namespace llvm; #define DEBUG_TYPE "loop-fusion" -STATISTIC(FuseCounter, "Count number of loop fusions performed"); +STATISTIC(FuseCounter, "Loops fused"); STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); STATISTIC(InvalidPreheader, "Loop has invalid preheader"); STATISTIC(InvalidHeader, "Loop has invalid header"); @@ -79,12 +79,15 @@ STATISTIC(MayThrowException, "Loop may throw an exception"); STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); -STATISTIC(InvalidTripCount, - "Loop does not have invariant backedge taken count"); +STATISTIC(UnknownTripCount, "Loop has unknown trip count"); STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); -STATISTIC(NonEqualTripCount, "Candidate trip counts are not the same"); -STATISTIC(NonAdjacent, "Candidates are not adjacent"); -STATISTIC(NonEmptyPreheader, "Candidate has a non-empty preheader"); +STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); +STATISTIC(NonAdjacent, "Loops are not adjacent"); +STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader"); +STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); +STATISTIC(NonIdenticalGuards, "Candidates have different guards"); +STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block"); +STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block"); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -110,6 +113,7 @@ static cl::opt<bool> cl::Hidden, cl::init(false), cl::ZeroOrMore); #endif +namespace { /// This class is used to represent a candidate for loop fusion. When it is /// constructed, it checks the conditions for loop fusion to ensure that it /// represents a valid candidate. It caches several parts of a loop that are @@ -143,6 +147,8 @@ struct FusionCandidate { SmallVector<Instruction *, 16> MemWrites; /// Are all of the members of this fusion candidate still valid bool Valid; + /// Guard branch of the loop, if it exists + BranchInst *GuardBranch; /// Dominator and PostDominator trees are needed for the /// FusionCandidateCompare function, required by FusionCandidateSet to @@ -151,11 +157,20 @@ struct FusionCandidate { const DominatorTree *DT; const PostDominatorTree *PDT; + OptimizationRemarkEmitter &ORE; + FusionCandidate(Loop *L, const DominatorTree *DT, - const PostDominatorTree *PDT) + const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE) : Preheader(L->getLoopPreheader()), Header(L->getHeader()), ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), - Latch(L->getLoopLatch()), L(L), Valid(true), DT(DT), PDT(PDT) { + Latch(L->getLoopLatch()), L(L), Valid(true), GuardBranch(nullptr), + DT(DT), PDT(PDT), ORE(ORE) { + + // TODO: This is temporary while we fuse both rotated and non-rotated + // loops. Once we switch to only fusing rotated loops, the initialization of + // GuardBranch can be moved into the initialization list above. + if (isRotated()) + GuardBranch = L->getLoopGuardBranch(); // Walk over all blocks in the loop and check for conditions that may // prevent fusion. 
For each block, walk over all instructions and collect @@ -163,28 +178,28 @@ struct FusionCandidate { // found, invalidate this object and return. for (BasicBlock *BB : L->blocks()) { if (BB->hasAddressTaken()) { - AddressTakenBB++; invalidate(); + reportInvalidCandidate(AddressTakenBB); return; } for (Instruction &I : *BB) { if (I.mayThrow()) { - MayThrowException++; invalidate(); + reportInvalidCandidate(MayThrowException); return; } if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { if (SI->isVolatile()) { - ContainsVolatileAccess++; invalidate(); + reportInvalidCandidate(ContainsVolatileAccess); return; } } if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (LI->isVolatile()) { - ContainsVolatileAccess++; invalidate(); + reportInvalidCandidate(ContainsVolatileAccess); return; } } @@ -214,19 +229,96 @@ struct FusionCandidate { assert(Latch == L->getLoopLatch() && "Latch is out of sync"); } + /// Get the entry block for this fusion candidate. + /// + /// If this fusion candidate represents a guarded loop, the entry block is the + /// loop guard block. If it represents an unguarded loop, the entry block is + /// the preheader of the loop. + BasicBlock *getEntryBlock() const { + if (GuardBranch) + return GuardBranch->getParent(); + else + return Preheader; + } + + /// Given a guarded loop, get the successor of the guard that is not in the + /// loop. + /// + /// This method returns the successor of the loop guard that is not located + /// within the loop (i.e., the successor of the guard that is not the + /// preheader). + /// This method is only valid for guarded loops. + BasicBlock *getNonLoopBlock() const { + assert(GuardBranch && "Only valid on guarded loops."); + assert(GuardBranch->isConditional() && + "Expecting guard to be a conditional branch."); + return (GuardBranch->getSuccessor(0) == Preheader) + ? GuardBranch->getSuccessor(1) + : GuardBranch->getSuccessor(0); + } + + bool isRotated() const { + assert(L && "Expecting loop to be valid."); + assert(Latch && "Expecting latch to be valid."); + return L->isLoopExiting(Latch); + } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void dump() const { - dbgs() << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") + dbgs() << "\tGuardBranch: " + << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" + << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") << "\n" << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" << "\tExitingBB: " << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") << "\n" - << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"; + << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" + << "\tEntryBlock: " + << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") + << "\n"; } #endif + /// Determine if a fusion candidate (representing a loop) is eligible for + /// fusion. Note that this only checks whether a single loop can be fused - it + /// does not check whether it is *legal* to fuse two loops together. 
+ bool isEligibleForFusion(ScalarEvolution &SE) const { + if (!isValid()) { + LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); + if (!Preheader) + ++InvalidPreheader; + if (!Header) + ++InvalidHeader; + if (!ExitingBlock) + ++InvalidExitingBlock; + if (!ExitBlock) + ++InvalidExitBlock; + if (!Latch) + ++InvalidLatch; + if (L->isInvalid()) + ++InvalidLoop; + + return false; + } + + // Require ScalarEvolution to be able to determine a trip count. + if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() + << " trip count not computable!\n"); + return reportInvalidCandidate(UnknownTripCount); + } + + if (!L->isLoopSimplifyForm()) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() + << " is not in simplified form!\n"); + return reportInvalidCandidate(NotSimplifiedForm); + } + + return true; + } + private: // This is only used internally for now, to clear the MemWrites and MemReads // list and setting Valid to false. I can't envision other uses of this right @@ -239,17 +331,18 @@ private: MemReads.clear(); Valid = false; } -}; -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const FusionCandidate &FC) { - if (FC.isValid()) - OS << FC.Preheader->getName(); - else - OS << "<Invalid>"; - - return OS; -} + bool reportInvalidCandidate(llvm::Statistic &Stat) const { + using namespace ore; + assert(L && Preheader && "Fusion candidate not initialized properly!"); + ++Stat; + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), + L->getStartLoc(), Preheader) + << "[" << Preheader->getParent()->getName() << "]: " + << "Loop is not a candidate for fusion: " << Stat.getDesc()); + return false; + } +}; struct FusionCandidateCompare { /// Comparison functor to sort two Control Flow Equivalent fusion candidates @@ -260,21 +353,24 @@ struct FusionCandidateCompare { const FusionCandidate &RHS) const { const DominatorTree *DT = LHS.DT; + BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); + BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); + // Do not save PDT to local variable as it is only used in asserts and thus // will trigger an unused variable warning if building without asserts. assert(DT && LHS.PDT && "Expecting valid dominator tree"); // Do this compare first so if LHS == RHS, function returns false. - if (DT->dominates(RHS.Preheader, LHS.Preheader)) { + if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { // RHS dominates LHS // Verify LHS post-dominates RHS - assert(LHS.PDT->dominates(LHS.Preheader, RHS.Preheader)); + assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); return false; } - if (DT->dominates(LHS.Preheader, RHS.Preheader)) { + if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { // Verify RHS Postdominates LHS - assert(LHS.PDT->dominates(RHS.Preheader, LHS.Preheader)); + assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); return true; } @@ -286,7 +382,6 @@ struct FusionCandidateCompare { } }; -namespace { using LoopVector = SmallVector<Loop *, 4>; // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance @@ -301,17 +396,26 @@ using LoopVector = SmallVector<Loop *, 4>; // keeps the FusionCandidateSet sorted will also simplify the implementation. 
using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; -} // namespace -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, +#if !defined(NDEBUG) +static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const FusionCandidate &FC) { + if (FC.isValid()) + OS << FC.Preheader->getName(); + else + OS << "<Invalid>"; + + return OS; +} + +static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FusionCandidateSet &CandSet) { - for (auto IT : CandSet) - OS << IT << "\n"; + for (const FusionCandidate &FC : CandSet) + OS << FC << '\n'; return OS; } -#if !defined(NDEBUG) static void printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { dbgs() << "Fusion Candidates: \n"; @@ -391,16 +495,6 @@ static void printLoopVector(const LoopVector &LV) { } #endif -static void reportLoopFusion(const FusionCandidate &FC0, - const FusionCandidate &FC1, - OptimizationRemarkEmitter &ORE) { - using namespace ore; - ORE.emit( - OptimizationRemark(DEBUG_TYPE, "LoopFusion", FC0.Preheader->getParent()) - << "Fused " << NV("Cand1", StringRef(FC0.Preheader->getName())) - << " with " << NV("Cand2", StringRef(FC1.Preheader->getName()))); -} - struct LoopFuser { private: // Sets of control flow equivalent fusion candidates for a given nest level. @@ -497,53 +591,16 @@ private: const FusionCandidate &FC1) const { assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); - if (DT.dominates(FC0.Preheader, FC1.Preheader)) - return PDT.dominates(FC1.Preheader, FC0.Preheader); + BasicBlock *FC0EntryBlock = FC0.getEntryBlock(); + BasicBlock *FC1EntryBlock = FC1.getEntryBlock(); - if (DT.dominates(FC1.Preheader, FC0.Preheader)) - return PDT.dominates(FC0.Preheader, FC1.Preheader); + if (DT.dominates(FC0EntryBlock, FC1EntryBlock)) + return PDT.dominates(FC1EntryBlock, FC0EntryBlock); - return false; - } - - /// Determine if a fusion candidate (representing a loop) is eligible for - /// fusion. Note that this only checks whether a single loop can be fused - it - /// does not check whether it is *legal* to fuse two loops together. - bool eligibleForFusion(const FusionCandidate &FC) const { - if (!FC.isValid()) { - LLVM_DEBUG(dbgs() << "FC " << FC << " has invalid CFG requirements!\n"); - if (!FC.Preheader) - InvalidPreheader++; - if (!FC.Header) - InvalidHeader++; - if (!FC.ExitingBlock) - InvalidExitingBlock++; - if (!FC.ExitBlock) - InvalidExitBlock++; - if (!FC.Latch) - InvalidLatch++; - if (FC.L->isInvalid()) - InvalidLoop++; + if (DT.dominates(FC1EntryBlock, FC0EntryBlock)) + return PDT.dominates(FC0EntryBlock, FC1EntryBlock); - return false; - } - - // Require ScalarEvolution to be able to determine a trip count. - if (!SE.hasLoopInvariantBackedgeTakenCount(FC.L)) { - LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName() - << " trip count not computable!\n"); - InvalidTripCount++; - return false; - } - - if (!FC.L->isLoopSimplifyForm()) { - LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName() - << " is not in simplified form!\n"); - NotSimplifiedForm++; - return false; - } - - return true; + return false; } /// Iterate over all loops in the given loop set and identify the loops that @@ -551,8 +608,8 @@ private: /// Flow Equivalent sets, sorted by dominance. 
void collectFusionCandidates(const LoopVector &LV) { for (Loop *L : LV) { - FusionCandidate CurrCand(L, &DT, &PDT); - if (!eligibleForFusion(CurrCand)) + FusionCandidate CurrCand(L, &DT, &PDT, ORE); + if (!CurrCand.isEligibleForFusion(SE)) continue; // Go through each list in FusionCandidates and determine if L is control @@ -664,31 +721,64 @@ private: if (!identicalTripCounts(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " "counts. Not fusing.\n"); - NonEqualTripCount++; + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEqualTripCount); continue; } if (!isAdjacent(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidates are not adjacent. Not fusing.\n"); - NonAdjacent++; + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); continue; } - // For now we skip fusing if the second candidate has any instructions - // in the preheader. This is done because we currently do not have the - // safety checks to determine if it is save to move the preheader of - // the second candidate past the body of the first candidate. Once - // these checks are added, this condition can be removed. + // Ensure that FC0 and FC1 have identical guards. + // If one (or both) are not guarded, this check is not necessary. + if (FC0->GuardBranch && FC1->GuardBranch && + !haveIdenticalGuards(*FC0, *FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " + "guards. Not Fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonIdenticalGuards); + continue; + } + + // The following three checks look for empty blocks in FC0 and FC1. If + // any of these blocks are non-empty, we do not fuse. This is done + // because we currently do not have the safety checks to determine if + // it is safe to move the blocks past other blocks in the loop. Once + // these checks are added, these conditions can be relaxed. if (!isEmptyPreheader(*FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " "preheader. Not fusing.\n"); - NonEmptyPreheader++; + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyPreheader); + continue; + } + + if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit " + "block. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyExitBlock); + continue; + } + + if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard " + "block. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyGuardBlock); continue; } + // Check the dependencies across the loops and do not fuse if it would + // violate them. if (!dependencesAllowFusion(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + InvalidDependencies); continue; } @@ -696,9 +786,11 @@ private: LLVM_DEBUG(dbgs() << "\tFusion appears to be " << (BeneficialToFuse ? "" : "un") << "profitable!\n"); - if (!BeneficialToFuse) + if (!BeneficialToFuse) { + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + FusionNotBeneficial); continue; - + } // All analysis has completed and has determined that fusion is legal // and profitable. At this point, start transforming the code and // perform fusion. 
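For readers following the driver logic above: candidates live in a dominance-ordered set, adjacent pairs are run through the legality checks, and a successful fusion replaces both entries and restarts the scan, since the set and the CFG have changed underneath the iterators. A minimal standalone analogue of that control structure, with integers standing in for loops and a trivial "adjacency" test in place of the real legality checks (illustrative only):

    #include <cstdio>
    #include <iterator>
    #include <set>

    int main() {
      std::set<int> Candidates = {1, 2, 3, 5}; // stand-ins for loops, ordered
      bool FusedAny = true;
      while (FusedAny) {
        FusedAny = false;
        for (auto It = Candidates.begin(); It != Candidates.end(); ++It) {
          auto Next = std::next(It);
          if (Next == Candidates.end())
            break;
          if (*Next == *It + 1) { // trivial stand-in for the legality checks
            std::printf("fusing %d and %d\n", *It, *Next);
            int Fused = *It; // the fused loop takes the first one's position
            Candidates.erase(Next);
            Candidates.erase(It);
            Candidates.insert(Fused);
            FusedAny = true;
            break; // iterators were invalidated; rescan the set
          }
        }
      }
    }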
@@ -710,15 +802,14 @@ private:
         // Note this needs to be done *before* performFusion because
         // performFusion will change the original loops, making it not
         // possible to identify them after fusion is complete.
-        reportLoopFusion(*FC0, *FC1, ORE);
+        reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter);
 
-        FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT);
+        FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE);
         FusedCand.verify();
-        assert(eligibleForFusion(FusedCand) &&
+        assert(FusedCand.isEligibleForFusion(SE) &&
                "Fused candidate should be eligible for fusion!");
 
         // Notify the loop-depth-tree that these loops are not valid objects
-        // anymore.
         LDT.removeLoop(FC1->L);
 
         CandidateSet.erase(FC0);
@@ -889,7 +980,7 @@ private:
     LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
                       << "\n");
     assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
-    assert(DT.dominates(FC0.Preheader, FC1.Preheader));
+    assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
 
     for (Instruction *WriteL0 : FC0.MemWrites) {
       for (Instruction *WriteL1 : FC1.MemWrites)
@@ -939,18 +1030,89 @@ private:
     return true;
   }
 
-  /// Determine if the exit block of \p FC0 is the preheader of \p FC1. In this
-  /// case, there is no code in between the two fusion candidates, thus making
-  /// them adjacent.
+  /// Determine if two fusion candidates are adjacent in the CFG.
+  ///
+  /// This method will determine if there are additional basic blocks in the
+  /// CFG between the exit of \p FC0 and the entry of \p FC1.
+  /// If the two candidates are guarded loops, then it checks whether the
+  /// non-loop successor of the \p FC0 guard branch is the entry block of \p
+  /// FC1. If not, then the loops are not adjacent. If the two candidates are
+  /// not guarded loops, then it checks whether the exit block of \p FC0 is the
+  /// preheader of \p FC1.
   bool isAdjacent(const FusionCandidate &FC0,
                   const FusionCandidate &FC1) const {
-    return FC0.ExitBlock == FC1.Preheader;
+    // If the successor of the guard branch is FC1, then the loops are
+    // adjacent.
+    if (FC0.GuardBranch)
+      return FC0.getNonLoopBlock() == FC1.getEntryBlock();
+    else
+      return FC0.ExitBlock == FC1.getEntryBlock();
+  }
+
+  /// Determine if two fusion candidates have identical guards.
+  ///
+  /// This method will determine if two fusion candidates have the same guards.
+  /// The guards are considered the same if:
+  /// 1. The instructions to compute the condition used in the compare are
+  ///    identical.
+  /// 2. The successors of the guard have the same flow into/around the loop.
+  /// If the compare instructions are identical, then the first successor of
+  /// the guard must go to the same place (either the preheader of the loop or
+  /// the NonLoopBlock). In other words, the first successors of both loops
+  /// must both go into the loop (i.e., the preheader) or both go around the
+  /// loop (i.e., the NonLoopBlock). The same must be true for the second
+  /// successor of each loop.
+  bool haveIdenticalGuards(const FusionCandidate &FC0,
+                           const FusionCandidate &FC1) const {
+    assert(FC0.GuardBranch && FC1.GuardBranch &&
+           "Expecting FC0 and FC1 to be guarded loops.");
+
+    if (auto FC0CmpInst =
+            dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
+      if (auto FC1CmpInst =
+              dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
+        if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
+          return false;
+
+    // The compare instructions are identical.
+    // Now make sure the successors of the guards have the same flow
+    // into/around the loop.
+    if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
+      return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
+    else
+      return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
+  }
+
+  /// Check that the guard for \p FC *only* contains the cmp/branch for the
+  /// guard.
+  /// Once we are able to handle intervening code, any code in the guard block
+  /// for FC1 will need to be treated as intervening code and checked whether
+  /// it can safely move around the loops.
+  bool isEmptyGuardBlock(const FusionCandidate &FC) const {
+    assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch.");
+    if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) {
+      auto *GuardBlock = FC.GuardBranch->getParent();
+      // If the generation of the cmp value is in GuardBlock, then the size of
+      // the guard block should be 2 (cmp + branch). If the generation of the
+      // cmp value is in a different block, then the size of the guard block
+      // should only be 1.
+      if (CmpInst->getParent() == GuardBlock)
+        return GuardBlock->size() == 2;
+      else
+        return GuardBlock->size() == 1;
+    }
+
+    return false;
+  }
 
   bool isEmptyPreheader(const FusionCandidate &FC) const {
+    assert(FC.Preheader && "Expecting a valid preheader");
     return FC.Preheader->size() == 1;
   }
 
+  bool isEmptyExitBlock(const FusionCandidate &FC) const {
+    assert(FC.ExitBlock && "Expecting a valid exit block");
+    return FC.ExitBlock->size() == 1;
+  }
+
   /// Fuse two fusion candidates, creating a new fused loop.
   ///
   /// This method contains the mechanics of fusing two loops, represented by \p
@@ -987,6 +1149,12 @@ private:
     LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
                dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
 
+    // Fusing guarded loops is handled slightly differently than non-guarded
+    // loops and has been broken out into a separate method instead of trying
+    // to intersperse the logic within a single method.
+    if (FC0.GuardBranch)
+      return fuseGuardedLoops(FC0, FC1);
+
     assert(FC1.Preheader == FC0.ExitBlock);
     assert(FC1.Preheader->size() == 1 &&
            FC1.Preheader->getSingleSuccessor() == FC1.Header);
@@ -1131,7 +1299,258 @@ private:
     SE.verify();
 #endif
 
-    FuseCounter++;
+    LLVM_DEBUG(dbgs() << "Fusion done:\n");
+
+    return FC0.L;
+  }
+
+  /// Report details on loop fusion opportunities.
+  ///
+  /// This template function can be used to report both successful and missed
+  /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
+  /// be one of:
+  ///   - OptimizationRemarkMissed to report when loop fusion is unsuccessful
+  ///     given two valid fusion candidates.
+  ///   - OptimizationRemark to report successful fusion of two fusion
+  ///     candidates.
+ /// The remarks will be printed using the form: + /// <path/filename>:<line number>:<column number>: [<function name>]: + /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> + template <typename RemarkKind> + void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, + llvm::Statistic &Stat) { + assert(FC0.Preheader && FC1.Preheader && + "Expecting valid fusion candidates"); + using namespace ore; + ++Stat; + ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), + FC0.Preheader) + << "[" << FC0.Preheader->getParent()->getName() + << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) + << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) + << ": " << Stat.getDesc()); + } + + /// Fuse two guarded fusion candidates, creating a new fused loop. + /// + /// Fusing guarded loops is handled much the same way as fusing non-guarded + /// loops. The rewiring of the CFG is slightly different though, because of + /// the presence of the guards around the loops and the exit blocks after the + /// loop body. As such, the new loop is rewired as follows: + /// 1. Keep the guard branch from FC0 and use the non-loop block target + /// from the FC1 guard branch. + /// 2. Remove the exit block from FC0 (this exit block should be empty + /// right now). + /// 3. Remove the guard branch for FC1 + /// 4. Remove the preheader for FC1. + /// The exit block successor for the latch of FC0 is updated to be the header + /// of FC1 and the non-exit block successor of the latch of FC1 is updated to + /// be the header of FC0, thus creating the fused loop. + Loop *fuseGuardedLoops(const FusionCandidate &FC0, + const FusionCandidate &FC1) { + assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); + + BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); + BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); + BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); + BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); + + assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); + + SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; + + //////////////////////////////////////////////////////////////////////////// + // Update the Loop Guard + //////////////////////////////////////////////////////////////////////////// + // The guard for FC0 is updated to guard both FC0 and FC1. This is done by + // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. + // Thus, one path from the guard goes to the preheader for FC0 (and thus + // executes the new fused loop) and the other path goes to the NonLoopBlock + // for FC1 (where FC1 guard would have gone if FC1 was not executed). + FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); + FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock, + FC1.Header); + + // The guard of FC1 is not necessary anymore. 
+ FC1.GuardBranch->eraseFromParent(); + new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); + + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); + + assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && + "Expecting guard block to have no predecessors"); + assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && + "Expecting guard block to have no successors"); + + // Remember the phi nodes originally in the header of FC0 in order to rewire + // them later. However, this is only necessary if the new loop carried + // values might not dominate the exiting branch. While we do not generally + // test if this is the case but simply insert intermediate phi nodes, we + // need to make sure these intermediate phi nodes have different + // predecessors. To this end, we filter the special case where the exiting + // block is the latch block of the first loop. Nothing needs to be done + // anyway as all loop carried values dominate the latch and thereby also the + // exiting branch. + // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch + // (because the loops are rotated. Thus, nothing will ever be added to + // OriginalFC0PHIs. + SmallVector<PHINode *, 8> OriginalFC0PHIs; + if (FC0.ExitingBlock != FC0.Latch) + for (PHINode &PHI : FC0.Header->phis()) + OriginalFC0PHIs.push_back(&PHI); + + assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); + + // Replace incoming blocks for header PHIs first. + FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); + FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); + + // The old exiting block of the first loop (FC0) has to jump to the header + // of the second as we need to execute the code in the second header block + // regardless of the trip count. That is, if the trip count is 0, so the + // back edge is never taken, we still have to execute both loop headers, + // especially (but not only!) if the second is a do-while style loop. + // However, doing so might invalidate the phi nodes of the first loop as + // the new values do only need to dominate their latch and not the exiting + // predicate. To remedy this potential problem we always introduce phi + // nodes in the header of the second loop later that select the loop carried + // value, if the second header was reached through an old latch of the + // first, or undef otherwise. This is sound as exiting the first implies the + // second will exit too, __without__ taking the back-edge (their + // trip-counts are equal after all). + FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, + FC1.Header); + + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); + + // Remove FC0 Exit Block + // The exit block for FC0 is no longer needed since control will flow + // directly to the header of FC1. Since it is an empty block, it can be + // removed at this point. 
+ // TODO: In the future, we can handle non-empty exit blocks by merging any
+ // instructions from FC0 exit block into FC1 exit block prior to removing
+ // the block.
+ assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) &&
+ "Expecting exit block to be empty");
+ FC0.ExitBlock->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
+
+ // Remove FC1 Preheader
+ // The pre-header of L1 is not necessary anymore.
+ assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
+ FC1.Preheader->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1.Preheader, FC1.Header));
+
+ // Moves the phi nodes from the second to the first loop's header block.
+ while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
+ if (SE.isSCEVable(PHI->getType()))
+ SE.forgetValue(PHI);
+ if (PHI->hasNUsesOrMore(1))
+ PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
+ else
+ PHI->eraseFromParent();
+ }
+
+ // Introduce new phi nodes in the second loop header to ensure
+ // exiting the first and jumping to the header of the second does not break
+ // the SSA property of the phis originally in the first loop. See also the
+ // comment above.
+ Instruction *L1HeaderIP = &FC1.Header->front();
+ for (PHINode *LCPHI : OriginalFC0PHIs) {
+ int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
+ assert(L1LatchBBIdx >= 0 &&
+ "Expected loop carried value to be rewired at this point!");
+
+ Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
+
+ PHINode *L1HeaderPHI = PHINode::Create(
+ LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
+ L1HeaderPHI->addIncoming(LCV, FC0.Latch);
+ L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
+ FC0.ExitingBlock);
+
+ LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
+ }
+
+ // Update the latches
+
+ // Replace latch terminator destinations.
+ FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
+ FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
+
+ // If FC0.Latch and FC0.ExitingBlock are the same then we have already
+ // performed the updates above.
+ if (FC0.Latch != FC0.ExitingBlock)
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.Latch, FC1.Header));
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+ FC0.Latch, FC0.Header));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
+ FC1.Latch, FC0.Header));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+ FC1.Latch, FC1.Header));
+
+ // All done
+ // Apply the updates to the Dominator Tree and cleanup.
+
+ assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
+ "FC1GuardBlock has successors!!");
+ assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
+ "FC1GuardBlock has predecessors!!");
+
+ // Update DT/PDT
+ DTU.applyUpdates(TreeUpdates);
+
+ LI.removeBlock(FC1.Preheader);
+ DTU.deleteBB(FC1.Preheader);
+ DTU.deleteBB(FC0.ExitBlock);
+ DTU.flush();
+
+ // Is there a way to keep SE up-to-date so we don't need to forget the loops
+ // and rebuild the information in subsequent passes of fusion?
+ SE.forgetLoop(FC1.L);
+ SE.forgetLoop(FC0.L);
+
+ // Merge the loops.
+ SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), + FC1.L->block_end()); + for (BasicBlock *BB : Blocks) { + FC0.L->addBlockEntry(BB); + FC1.L->removeBlockFromLoop(BB); + if (LI.getLoopFor(BB) != FC1.L) + continue; + LI.changeLoopFor(BB, FC0.L); + } + while (!FC1.L->empty()) { + const auto &ChildLoopIt = FC1.L->begin(); + Loop *ChildLoop = *ChildLoopIt; + FC1.L->removeChildLoop(ChildLoopIt); + FC0.L->addChildLoop(ChildLoop); + } + + // Delete the now empty loop L1. + LI.erase(FC1.L); + +#ifndef NDEBUG + assert(!verifyFunction(*FC0.Header->getParent(), &errs())); + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + assert(PDT.verify()); + LI.verify(DT); + SE.verify(); +#endif LLVM_DEBUG(dbgs() << "Fusion done:\n"); @@ -1177,6 +1596,7 @@ struct LoopFuseLegacy : public FunctionPass { return LF.fuseLoops(F); } }; +} // namespace PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { auto &LI = AM.getResult<LoopAnalysis>(F); diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index e561494f19cf..dd477e800693 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -41,6 +41,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -77,16 +78,20 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/IR/Verifier.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -102,6 +107,7 @@ using namespace llvm; STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); +STATISTIC(NumBCmp, "Number of memcmp's formed from loop 2xload+eq-compare"); static cl::opt<bool> UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", @@ -111,6 +117,26 @@ static cl::opt<bool> UseLIRCodeSizeHeurs( namespace { +// FIXME: reinventing the wheel much? Is there a cleaner solution? 
+struct PMAbstraction { + virtual void markLoopAsDeleted(Loop *L) = 0; + virtual ~PMAbstraction() = default; +}; +struct LegacyPMAbstraction : PMAbstraction { + LPPassManager &LPM; + LegacyPMAbstraction(LPPassManager &LPM) : LPM(LPM) {} + virtual ~LegacyPMAbstraction() = default; + void markLoopAsDeleted(Loop *L) override { LPM.markLoopAsDeleted(*L); } +}; +struct NewPMAbstraction : PMAbstraction { + LPMUpdater &Updater; + NewPMAbstraction(LPMUpdater &Updater) : Updater(Updater) {} + virtual ~NewPMAbstraction() = default; + void markLoopAsDeleted(Loop *L) override { + Updater.markLoopAsDeleted(*L, L->getName()); + } +}; + class LoopIdiomRecognize { Loop *CurLoop = nullptr; AliasAnalysis *AA; @@ -120,6 +146,7 @@ class LoopIdiomRecognize { TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; const DataLayout *DL; + PMAbstraction &LoopDeleter; OptimizationRemarkEmitter &ORE; bool ApplyCodeSizeHeuristics; @@ -128,9 +155,10 @@ public: LoopInfo *LI, ScalarEvolution *SE, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const DataLayout *DL, + const DataLayout *DL, PMAbstraction &LoopDeleter, OptimizationRemarkEmitter &ORE) - : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {} + : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), + LoopDeleter(LoopDeleter), ORE(ORE) {} bool runOnLoop(Loop *L); @@ -144,6 +172,8 @@ private: bool HasMemset; bool HasMemsetPattern; bool HasMemcpy; + bool HasMemCmp; + bool HasBCmp; /// Return code for isLegalStore() enum LegalStoreKind { @@ -186,6 +216,32 @@ private: bool runOnNoncountableLoop(); + struct CmpLoopStructure { + Value *BCmpValue, *LatchCmpValue; + BasicBlock *HeaderBrEqualBB, *HeaderBrUnequalBB; + BasicBlock *LatchBrFinishBB, *LatchBrContinueBB; + }; + bool matchBCmpLoopStructure(CmpLoopStructure &CmpLoop) const; + struct CmpOfLoads { + ICmpInst::Predicate BCmpPred; + Value *LoadSrcA, *LoadSrcB; + Value *LoadA, *LoadB; + }; + bool matchBCmpOfLoads(Value *BCmpValue, CmpOfLoads &CmpOfLoads) const; + bool recognizeBCmpLoopControlFlow(const CmpOfLoads &CmpOfLoads, + CmpLoopStructure &CmpLoop) const; + bool recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, CmpOfLoads &CmpOfLoads, + const SCEV *&SrcA, const SCEV *&SrcB, + const SCEV *&Iterations) const; + bool detectBCmpIdiom(ICmpInst *&BCmpInst, CmpInst *&LatchCmpInst, + LoadInst *&LoadA, LoadInst *&LoadB, const SCEV *&SrcA, + const SCEV *&SrcB, const SCEV *&NBytes) const; + BasicBlock *transformBCmpControlFlow(ICmpInst *ComparedEqual); + void transformLoopToBCmp(ICmpInst *BCmpInst, CmpInst *LatchCmpInst, + LoadInst *LoadA, LoadInst *LoadB, const SCEV *SrcA, + const SCEV *SrcB, const SCEV *NBytes); + bool recognizeBCmp(); + bool recognizePopcount(); void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var); @@ -217,18 +273,20 @@ public: LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()); const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( *L->getHeader()->getParent()); const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout(); + LegacyPMAbstraction LoopDeleter(LPM); // For the old PM, we can't use OptimizationRemarkEmitter as an analysis // pass. 
Function analyses need to be preserved across loop transformations // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L->getHeader()->getParent()); - LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, ORE); + LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, LoopDeleter, ORE); return LIR.runOnLoop(L); } @@ -247,7 +305,7 @@ char LoopIdiomRecognizeLegacyPass::ID = 0; PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, - LPMUpdater &) { + LPMUpdater &Updater) { const auto *DL = &L.getHeader()->getModule()->getDataLayout(); const auto &FAM = @@ -261,8 +319,9 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, "LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached " "at a higher level"); + NewPMAbstraction LoopDeleter(Updater); LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL, - *ORE); + LoopDeleter, *ORE); if (!LIR.runOnLoop(&L)) return PreservedAnalyses::all(); @@ -299,7 +358,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { // Disable loop idiom recognition if the function's name is a common idiom. StringRef Name = L->getHeader()->getParent()->getName(); - if (Name == "memset" || Name == "memcpy") + if (Name == "memset" || Name == "memcpy" || Name == "memcmp" || + Name == "bcmp") return false; // Determine if code size heuristics need to be applied. @@ -309,8 +369,10 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { HasMemset = TLI->has(LibFunc_memset); HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); HasMemcpy = TLI->has(LibFunc_memcpy); + HasMemCmp = TLI->has(LibFunc_memcmp); + HasBCmp = TLI->has(LibFunc_bcmp); - if (HasMemset || HasMemsetPattern || HasMemcpy) + if (HasMemset || HasMemsetPattern || HasMemcpy || HasMemCmp || HasBCmp) if (SE->hasLoopInvariantBackedgeTakenCount(L)) return runOnCountableLoop(); @@ -961,7 +1023,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern"); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. - GV->setAlignment(16); + GV->setAlignment(Align(16)); Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); } @@ -1149,7 +1211,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { << "] Noncountable Loop %" << CurLoop->getHeader()->getName() << "\n"); - return recognizePopcount() || recognizeAndInsertFFS(); + return recognizeBCmp() || recognizePopcount() || recognizeAndInsertFFS(); } /// Check if the given conditional branch is based on the comparison between @@ -1823,3 +1885,811 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, // loop. The loop would otherwise not be deleted even if it becomes empty. SE->forgetLoop(CurLoop); } + +bool LoopIdiomRecognize::matchBCmpLoopStructure( + CmpLoopStructure &CmpLoop) const { + ICmpInst::Predicate BCmpPred; + + // We are looking for the following basic layout: + // PreheaderBB: <preheader> ; preds = ??? 
+ // <...> + // br label %LoopHeaderBB + // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB + // <...> + // %BCmpValue = icmp <...> + // br i1 %BCmpValue, label %LoopLatchBB, label %Successor0 + // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB + // <...> + // %LatchCmpValue = <are we done, or do next iteration?> + // br i1 %LatchCmpValue, label %Successor1, label %LoopHeaderBB + // Successor0: <exit> ; preds = %LoopHeaderBB + // <...> + // Successor1: <exit> ; preds = %LoopLatchBB + // <...> + // + // Successor0 and Successor1 may or may not be the same basic block. + + // Match basic frame-work of this supposedly-comparison loop. + using namespace PatternMatch; + if (!match(CurLoop->getHeader()->getTerminator(), + m_Br(m_CombineAnd(m_ICmp(BCmpPred, m_Value(), m_Value()), + m_Value(CmpLoop.BCmpValue)), + CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB)) || + !match(CurLoop->getLoopLatch()->getTerminator(), + m_Br(m_CombineAnd(m_Cmp(), m_Value(CmpLoop.LatchCmpValue)), + CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB))) { + LLVM_DEBUG(dbgs() << "Basic control-flow layout unrecognized.\n"); + return false; + } + LLVM_DEBUG(dbgs() << "Recognized basic control-flow layout.\n"); + return true; +} + +bool LoopIdiomRecognize::matchBCmpOfLoads(Value *BCmpValue, + CmpOfLoads &CmpOfLoads) const { + using namespace PatternMatch; + LLVM_DEBUG(dbgs() << "Analyzing header icmp " << *BCmpValue + << " as bcmp pattern.\n"); + + // Match bcmp-style loop header cmp. It must be an eq-icmp of loads. Example: + // %v0 = load <...>, <...>* %LoadSrcA + // %v1 = load <...>, <...>* %LoadSrcB + // %CmpLoop.BCmpValue = icmp eq <...> %v0, %v1 + // There won't be any no-op bitcasts between load and icmp, + // they would have been transformed into a load of bitcast. + // FIXME: {b,mem}cmp() calls have the same semantics as icmp. Match them too. + if (!match(BCmpValue, + m_ICmp(CmpOfLoads.BCmpPred, + m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcA)), + m_Value(CmpOfLoads.LoadA)), + m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcB)), + m_Value(CmpOfLoads.LoadB)))) || + !ICmpInst::isEquality(CmpOfLoads.BCmpPred)) { + LLVM_DEBUG(dbgs() << "Loop header icmp did not match bcmp pattern.\n"); + return false; + } + LLVM_DEBUG(dbgs() << "Recognized header icmp as bcmp pattern with loads:\n\t" + << *CmpOfLoads.LoadA << "\n\t" << *CmpOfLoads.LoadB + << "\n"); + // FIXME: handle memcmp pattern? + return true; +} + +bool LoopIdiomRecognize::recognizeBCmpLoopControlFlow( + const CmpOfLoads &CmpOfLoads, CmpLoopStructure &CmpLoop) const { + BasicBlock *LoopHeaderBB = CurLoop->getHeader(); + BasicBlock *LoopLatchBB = CurLoop->getLoopLatch(); + + // Be wary, comparisons can be inverted, canonicalize order. + // If this 'element' comparison passed, we expect to proceed to the next elt. + if (CmpOfLoads.BCmpPred != ICmpInst::Predicate::ICMP_EQ) + std::swap(CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB); + // The predicate on loop latch does not matter, just canonicalize some order. + if (CmpLoop.LatchBrContinueBB != LoopHeaderBB) + std::swap(CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB); + + SmallVector<BasicBlock *, 2> ExitBlocks; + + CurLoop->getUniqueExitBlocks(ExitBlocks); + assert(ExitBlocks.size() <= 2U && "Can't have more than two exit blocks."); + + // Check that control-flow between blocks is as expected. 
+ if (CmpLoop.HeaderBrEqualBB != LoopLatchBB ||
+ CmpLoop.LatchBrContinueBB != LoopHeaderBB ||
+ !is_contained(ExitBlocks, CmpLoop.HeaderBrUnequalBB) ||
+ !is_contained(ExitBlocks, CmpLoop.LatchBrFinishBB)) {
+ LLVM_DEBUG(dbgs() << "Loop control-flow not recognized.\n");
+ return false;
+ }
+
+ assert(!is_contained(ExitBlocks, CmpLoop.HeaderBrEqualBB) &&
+ !is_contained(ExitBlocks, CmpLoop.LatchBrContinueBB) &&
+ "Unexpected exit edges.");
+
+ LLVM_DEBUG(dbgs() << "Recognized loop control-flow.\n");
+
+ LLVM_DEBUG(dbgs() << "Performing side-effect analysis on the loop.\n");
+ assert(CurLoop->isLCSSAForm(*DT) && "Should only get LCSSA-form loops here.");
+ // No loop instruction may be used outside of the loop. Since we are in
+ // LCSSA form, we only need to check the successor blocks' PHI nodes' incoming
+ // values for incoming blocks that are the loop basic blocks.
+ for (const BasicBlock *ExitBB : ExitBlocks) {
+ for (const PHINode &PHI : ExitBB->phis()) {
+ for (const BasicBlock *LoopBB :
+ make_filter_range(PHI.blocks(), [this](BasicBlock *PredecessorBB) {
+ return CurLoop->contains(PredecessorBB);
+ })) {
+ const auto *I =
+ dyn_cast<Instruction>(PHI.getIncomingValueForBlock(LoopBB));
+ if (I && CurLoop->contains(I)) {
+ LLVM_DEBUG(dbgs()
+ << "Loop contains instruction " << *I
+ << " which is used outside of the loop in basic block "
+ << ExitBB->getName() << " in phi node " << PHI << "\n");
+ return false;
+ }
+ }
+ }
+ }
+ // Similarly, the loop should not have any other observable side-effects
+ // other than the final comparison result.
+ for (BasicBlock *LoopBB : CurLoop->blocks()) {
+ for (Instruction &I : *LoopBB) {
+ if (isa<DbgInfoIntrinsic>(I)) // Ignore dbginfo.
+ continue; // FIXME: anything else? lifetime info?
+ if ((I.mayHaveSideEffects() || I.isAtomic() || I.isFenceLike()) &&
+ &I != CmpOfLoads.LoadA && &I != CmpOfLoads.LoadB) {
+ LLVM_DEBUG(
+ dbgs() << "Loop contains instruction with potential side-effects: "
+ << I << "\n");
+ return false;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "No loop instructions deemed to have side-effects.\n");
+ return true;
+}
+
+bool LoopIdiomRecognize::recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes,
+ CmpOfLoads &CmpOfLoads,
+ const SCEV *&SrcA,
+ const SCEV *&SrcB,
+ const SCEV *&Iterations) const {
+ // Try to compute SCEV of the loads, for this loop's scope.
+ const auto *ScevForSrcA = dyn_cast<SCEVAddRecExpr>(
+ SE->getSCEVAtScope(CmpOfLoads.LoadSrcA, CurLoop));
+ const auto *ScevForSrcB = dyn_cast<SCEVAddRecExpr>(
+ SE->getSCEVAtScope(CmpOfLoads.LoadSrcB, CurLoop));
+ if (!ScevForSrcA || !ScevForSrcB) {
+ LLVM_DEBUG(dbgs() << "Failed to get SCEV expressions for load sources.\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Got SCEV expressions (at loop scope) for loads:\n\t"
+ << *ScevForSrcA << "\n\t" << *ScevForSrcB << "\n");
+
+ // Loads must have the following SCEV exprs: {%ptr,+,BCmpTyBytes}<%LoopHeaderBB>
+ const SCEV *RecStepForA = ScevForSrcA->getStepRecurrence(*SE);
+ const SCEV *RecStepForB = ScevForSrcB->getStepRecurrence(*SE);
+ if (!ScevForSrcA->isAffine() || !ScevForSrcB->isAffine() ||
+ ScevForSrcA->getLoop() != CurLoop || ScevForSrcB->getLoop() != CurLoop ||
+ RecStepForA != RecStepForB || !isa<SCEVConstant>(RecStepForA) ||
+ cast<SCEVConstant>(RecStepForA)->getAPInt() != BCmpTyBytes) {
+ LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads. Only support "
+ "affine SCEV expressions originating in the loop we "
+ "are analysing with identical constant positive step, "
+ "equal to the count of bytes compared. Got:\n\t"
+ << *RecStepForA << "\n\t" << *RecStepForB << "\n");
+ return false;
+ // FIXME: can support BCmpTyBytes > Step.
+ // But will need to account for the extra bytes compared at the end.
+ }
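As a concrete illustration (assumed source, not part of this patch), a loop of
the form

    for (unsigned i = 0; i != n; ++i)   // with `const int *a, *b`
      if (a[i] != b[i])
        return false;

gives both load pointers affine add-recurrences in this loop, for example
{%a,+,4}<%loop> and {%b,+,4}<%loop>: the same loop, an identical constant step,
and the step (4) equal to BCmpTyBytes for the 32-bit compared type, so the
check above is satisfied.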
+
+ SrcA = ScevForSrcA->getStart();
+ SrcB = ScevForSrcB->getStart();
+ LLVM_DEBUG(dbgs() << "Got SCEV expressions for load sources:\n\t" << *SrcA
+ << "\n\t" << *SrcB << "\n");
+
+ // The load sources must be loop-invariants that dominate the loop header.
+ if (SrcA == SE->getCouldNotCompute() || SrcB == SE->getCouldNotCompute() ||
+ !SE->isAvailableAtLoopEntry(SrcA, CurLoop) ||
+ !SE->isAvailableAtLoopEntry(SrcB, CurLoop)) {
+ LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads, unavailable "
+ "prior to loop header.\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "SCEV expressions for loads are acceptable.\n");
+
+ // bcmp / memcmp take the length argument as size_t, so let's conservatively
+ // assume that the iteration count should not be wider than that.
+ Type *CmpFuncSizeTy = DL->getIntPtrType(SE->getContext());
+
+ // For how many iterations is the loop guaranteed not to exit via LoopLatch?
+ // This is one less than the maximal number of comparisons, and is: n + -1
+ const SCEV *LoopExitCount =
+ SE->getExitCount(CurLoop, CurLoop->getLoopLatch());
+ LLVM_DEBUG(dbgs() << "Got SCEV expression for loop latch exit count: "
+ << *LoopExitCount << "\n");
+ // The exit count, similarly, must be a loop-invariant that dominates the
+ // loop header.
+ if (LoopExitCount == SE->getCouldNotCompute() ||
+ !LoopExitCount->getType()->isIntOrPtrTy() ||
+ LoopExitCount->getType()->getScalarSizeInBits() >
+ CmpFuncSizeTy->getScalarSizeInBits() ||
+ !SE->isAvailableAtLoopEntry(LoopExitCount, CurLoop)) {
+ LLVM_DEBUG(dbgs() << "Unsupported SCEV expression for loop latch exit.\n");
+ return false;
+ }
+
+ // LoopExitCount is always one less than the actual count of iterations.
+ // Do this before the cast, else we will be stuck with 1 + zext(-1 + n)
+ Iterations = SE->getAddExpr(
+ LoopExitCount, SE->getOne(LoopExitCount->getType()), SCEV::FlagNUW);
+ assert(Iterations != SE->getCouldNotCompute() &&
+ "Shouldn't fail to increment by one.");
+
+ LLVM_DEBUG(dbgs() << "Computed iteration count: " << *Iterations << "\n");
+ return true;
+}
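Before the detection routine below, a minimal sketch of the idiom it looks for
and of the intended end result (source-level view; the names are illustrative):

    // Before: an equality-compare loop over two buffers, with two exits:
    // mismatch found (header exit) or buffers exhausted (latch exit).
    bool equal(const char *a, const char *b, unsigned long n) {
      for (unsigned long i = 0; i != n; ++i)
        if (a[i] != b[i])
          return false;
      return true;
    }
    // After the transform, conceptually:
    //   return bcmp(a, b, n) == 0;   // or memcmp(a, b, n) == 0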
+
+/// Return true iff the bcmp idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p BCmpInst is set to the root byte-comparison instruction.
+/// 2) \p LatchCmpInst is set to the comparison that controls the latch.
+/// 3) \p LoadA is set to the first LoadInst.
+/// 4) \p LoadB is set to the second LoadInst.
+/// 5) \p SrcA is set to the first source location that is being compared.
+/// 6) \p SrcB is set to the second source location that is being compared.
+/// 7) \p NBytes is set to the number of bytes to compare.
+bool LoopIdiomRecognize::detectBCmpIdiom(ICmpInst *&BCmpInst,
+ CmpInst *&LatchCmpInst,
+ LoadInst *&LoadA, LoadInst *&LoadB,
+ const SCEV *&SrcA, const SCEV *&SrcB,
+ const SCEV *&NBytes) const {
+ LLVM_DEBUG(dbgs() << "Recognizing bcmp idiom\n");
+
+ // Give up if the loop is not in normal form, or has more than 2 blocks.
+ if (!CurLoop->isLoopSimplifyForm() || CurLoop->getNumBlocks() > 2) {
+ LLVM_DEBUG(dbgs() << "Basic loop structure unrecognized.\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Recognized basic loop structure.\n");
+
+ CmpLoopStructure CmpLoop;
+ if (!matchBCmpLoopStructure(CmpLoop))
+ return false;
+
+ CmpOfLoads CmpOfLoads;
+ if (!matchBCmpOfLoads(CmpLoop.BCmpValue, CmpOfLoads))
+ return false;
+
+ if (!recognizeBCmpLoopControlFlow(CmpOfLoads, CmpLoop))
+ return false;
+
+ BCmpInst = cast<ICmpInst>(CmpLoop.BCmpValue); // FIXME: is there no
+ LatchCmpInst = cast<CmpInst>(CmpLoop.LatchCmpValue); // way to combine
+ LoadA = cast<LoadInst>(CmpOfLoads.LoadA); // these casts with
+ LoadB = cast<LoadInst>(CmpOfLoads.LoadB); // m_Value() matcher?
+
+ Type *BCmpValTy = BCmpInst->getOperand(0)->getType();
+ LLVMContext &Context = BCmpValTy->getContext();
+ uint64_t BCmpTyBits = DL->getTypeSizeInBits(BCmpValTy);
+ static constexpr uint64_t ByteTyBits = 8;
+
+ LLVM_DEBUG(dbgs() << "Got comparison between values of type " << *BCmpValTy
+ << " of size " << BCmpTyBits
+ << " bits (while byte = " << ByteTyBits << " bits).\n");
+ // bcmp()/memcmp()'s minimal unit of work is a byte. Therefore we must check
+ // that we are dealing with a multiple of a byte here.
+ if (BCmpTyBits % ByteTyBits != 0) {
+ LLVM_DEBUG(dbgs() << "Value size is not a multiple of byte.\n");
+ return false;
+ // FIXME: could still be done under a run-time check that the total bit
+ // count is a multiple of a byte, I guess? Or handle remainder separately?
+ }
+
+ // Each comparison is done on this many bytes.
+ uint64_t BCmpTyBytes = BCmpTyBits / ByteTyBits;
+ LLVM_DEBUG(dbgs() << "Size is exactly " << BCmpTyBytes
+ << " bytes, eligible for bcmp conversion.\n");
+
+ const SCEV *Iterations;
+ if (!recognizeBCmpLoopSCEV(BCmpTyBytes, CmpOfLoads, SrcA, SrcB, Iterations))
+ return false;
+
+ // bcmp / memcmp take the length argument as size_t, do the promotion now.
+ Type *CmpFuncSizeTy = DL->getIntPtrType(Context);
+ Iterations = SE->getNoopOrZeroExtend(Iterations, CmpFuncSizeTy);
+ assert(Iterations != SE->getCouldNotCompute() && "Promotion failed.");
+ // Note that it didn't do the ptrtoint cast, we will need to do it manually.
+
+ // We will be comparing *bytes*, not BCmpTy, so we need to recalculate size.
+ // It's a multiplication, and it *could* overflow. But for it to overflow
+ // we'd want to compare more bytes than could be represented by size_t. But
+ // allocation functions also take size_t. So how would you produce such a
+ // buffer?
+ // FIXME: we likely need to actually check that we know this won't overflow,
+ // via llvm::computeOverflowForUnsignedMul().
+ NBytes = SE->getMulExpr(
+ Iterations, SE->getConstant(CmpFuncSizeTy, BCmpTyBytes), SCEV::FlagNUW);
+ assert(NBytes != SE->getCouldNotCompute() &&
+ "Shouldn't fail to compute the byte count.");
+
+ LLVM_DEBUG(dbgs() << "Computed total byte count: " << *NBytes << "\n");
+
+ if (LoadA->getPointerAddressSpace() != LoadB->getPointerAddressSpace() ||
+ LoadA->getPointerAddressSpace() != 0 || !LoadA->isSimple() ||
+ !LoadB->isSimple()) {
+ StringLiteral L("Unsupported loads in idiom - only support identical, "
+ "simple loads from address space 0.\n");
+ LLVM_DEBUG(dbgs() << L);
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "BCmpIdiomUnsupportedLoads",
+ BCmpInst->getDebugLoc(),
+ CurLoop->getHeader())
+ << L;
+ });
+ return false; // FIXME: support non-simple loads.
+ }
+
+ LLVM_DEBUG(dbgs() << "Recognized bcmp idiom\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "RecognizedBCmpIdiom",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << "Loop recognized as a bcmp idiom";
+ });
+
+ return true;
+}
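A worked instance of the size computation above (with illustrative numbers):
for a loop comparing i32 elements whose latch exit count is n - 1, Iterations
becomes (n - 1) + 1 = n, is zero-extended to size_t, and NBytes = n * 4 is the
length that will be passed to the bcmp()/memcmp() call.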
+
+BasicBlock *
+LoopIdiomRecognize::transformBCmpControlFlow(ICmpInst *ComparedEqual) {
+ LLVM_DEBUG(dbgs() << "Transforming control-flow.\n");
+ SmallVector<DominatorTree::UpdateType, 8> DTUpdates;
+
+ BasicBlock *PreheaderBB = CurLoop->getLoopPreheader();
+ BasicBlock *HeaderBB = CurLoop->getHeader();
+ BasicBlock *LoopLatchBB = CurLoop->getLoopLatch();
+ SmallString<32> LoopName = CurLoop->getName();
+ Function *Func = PreheaderBB->getParent();
+ LLVMContext &Context = Func->getContext();
+
+ // Before doing anything, drop SCEV info.
+ SE->forgetLoop(CurLoop);
+
+ // Here we start with: (0/6)
+ // PreheaderBB: <preheader> ; preds = ???
+ // <...>
+ // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+ // %ComparedEqual = icmp eq <...> %memcmp, 0
+ // br label %LoopHeaderBB
+ // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB
+ // <...>
+ // br i1 %<...>, label %LoopLatchBB, label %Successor0BB
+ // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
+ // <...>
+ // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
+ // Successor0BB: <exit> ; preds = %LoopHeaderBB
+ // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
+ // <...>
+ // Successor1BB: <exit> ; preds = %LoopLatchBB
+ // %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
+ // <...>
+ //
+ // Successor0 and Successor1 may or may not be the same basic block.
+
+ // Decouple the edge between loop preheader basic block and loop header basic
+ // block. Thus the loop has become unreachable.
+ assert(cast<BranchInst>(PreheaderBB->getTerminator())->isUnconditional() &&
+ PreheaderBB->getTerminator()->getSuccessor(0) == HeaderBB &&
+ "Preheader bb must end with an unconditional branch to header bb.");
+ PreheaderBB->getTerminator()->eraseFromParent();
+ DTUpdates.push_back({DominatorTree::Delete, PreheaderBB, HeaderBB});
+
+ // Create a new preheader basic block before loop header basic block.
+ auto *PhonyPreheaderBB = BasicBlock::Create(
+ Context, LoopName + ".phonypreheaderbb", Func, HeaderBB);
+ // And insert an unconditional branch from phony preheader basic block to
+ // loop header basic block.
+ IRBuilder<>(PhonyPreheaderBB).CreateBr(HeaderBB);
+ DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB});
+
+ // Create a *single* new empty block that we will substitute as a
+ // successor basic block for the loop's exits. This one is temporary.
+ // Much like phony preheader basic block, it is not connected.
+ auto *PhonySuccessorBB =
+ BasicBlock::Create(Context, LoopName + ".phonysuccessorbb", Func,
+ LoopLatchBB->getNextNode());
+ // That block must have *some* non-PHI instruction, or else deleteDeadLoop()
+ // will mess up cleanup of dbginfo, and verifier will complain.
+ IRBuilder<>(PhonySuccessorBB).CreateUnreachable();
+
+ // Create two new empty blocks that we will use to preserve the original
+ // loop exit control-flow, and preserve the incoming values in the PHI nodes
+ // in loop's successor exit blocks. These will live on.
+ auto *ComparedUnequalBB =
+ BasicBlock::Create(Context, ComparedEqual->getName() + ".unequalbb", Func,
+ PhonySuccessorBB->getNextNode());
+ auto *ComparedEqualBB =
+ BasicBlock::Create(Context, ComparedEqual->getName() + ".equalbb", Func,
+ PhonySuccessorBB->getNextNode());
+
+ // By now we have: (1/6)
+ // PreheaderBB: ; preds = ???
+ // <...>
+ // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+ // %ComparedEqual = icmp eq <...> %memcmp, 0
+ // [no terminator instruction!]
+ // PhonyPreheaderBB: <preheader> ; No preds, UNREACHABLE!
+ // br label %LoopHeaderBB
+ // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB
+ // <...>
+ // br i1 %<...>, label %LoopLatchBB, label %Successor0BB
+ // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
+ // <...>
+ // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
+ // PhonySuccessorBB: ; No preds, UNREACHABLE!
+ // unreachable
+ // EqualBB: ; No preds, UNREACHABLE!
+ // [no terminator instruction!]
+ // UnequalBB: ; No preds, UNREACHABLE!
+ // [no terminator instruction!]
+ // Successor0BB: <exit> ; preds = %LoopHeaderBB
+ // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
+ // <...>
+ // Successor1BB: <exit> ; preds = %LoopLatchBB
+ // %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
+ // <...>
+
+ // What is the mapping/replacement basic block for exiting out of the loop
+ // from either of the old loop's basic blocks?
+ auto GetReplacementBB = [this, ComparedEqualBB,
+ ComparedUnequalBB](const BasicBlock *OldBB) {
+ assert(CurLoop->contains(OldBB) && "Only for loop's basic blocks.");
+ if (OldBB == CurLoop->getLoopLatch()) // "all elements compared equal".
+ return ComparedEqualBB;
+ if (OldBB == CurLoop->getHeader()) // "element compared unequal".
+ return ComparedUnequalBB;
+ llvm_unreachable("Only had two basic blocks in loop.");
+ };
+
+ // What are the exits out of this loop?
+ SmallVector<Loop::Edge, 2> LoopExitEdges;
+ CurLoop->getExitEdges(LoopExitEdges);
+ assert(LoopExitEdges.size() == 2 && "Should have only two exit edges.");
+
+ // Populate new basic blocks, update the exiting control-flow, PHI nodes.
+ for (const Loop::Edge &Edge : LoopExitEdges) {
+ auto *OldLoopBB = const_cast<BasicBlock *>(Edge.first);
+ auto *SuccessorBB = const_cast<BasicBlock *>(Edge.second);
+ assert(CurLoop->contains(OldLoopBB) && !CurLoop->contains(SuccessorBB) &&
+ "Unexpected edge.");
+
+ // If we would exit the loop from this loop's basic block,
+ // what semantically would that mean? Did comparison succeed or fail?
+ BasicBlock *NewBB = GetReplacementBB(OldLoopBB);
+ assert(NewBB->empty() && "Should not get same new basic block here twice.");
+ IRBuilder<> Builder(NewBB);
+ Builder.SetCurrentDebugLocation(OldLoopBB->getTerminator()->getDebugLoc());
+ Builder.CreateBr(SuccessorBB);
+ DTUpdates.push_back({DominatorTree::Insert, NewBB, SuccessorBB});
+ // Also, be *REALLY* careful with PHI nodes in successor basic block,
+ // update them to receive the same input value, but not from current loop's
+ // basic block, but from new basic block instead.
+ SuccessorBB->replacePhiUsesWith(OldLoopBB, NewBB);
+ // Also, change loop control-flow. This loop's basic block shall no longer
+ // exit from the loop to its original successor basic block, but to our new
+ // phony successor basic block. Note that new successor will be unique exit.
+ OldLoopBB->getTerminator()->replaceSuccessorWith(SuccessorBB, + PhonySuccessorBB); + DTUpdates.push_back({DominatorTree::Delete, OldLoopBB, SuccessorBB}); + DTUpdates.push_back({DominatorTree::Insert, OldLoopBB, PhonySuccessorBB}); + } + + // Inform DomTree about edge changes. Note that LoopInfo is still out-of-date. + assert(DTUpdates.size() == 8 && "Update count prediction failed."); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + DTU.applyUpdates(DTUpdates); + DTUpdates.clear(); + + // By now we have: (2/6) + // PreheaderBB: ; preds = ??? + // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // [no terminator instruction!] + // PhonyPreheaderBB: <preheader> ; No preds, UNREACHABLE! + // br label %LoopHeaderBB + // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB + // <...> + // br i1 %<...>, label %LoopLatchBB, label %PhonySuccessorBB + // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB + // <...> + // br i1 %<...>, label %PhonySuccessorBB, label %LoopHeaderBB + // PhonySuccessorBB: <uniq. exit> ; preds = %LoopHeaderBB, %LoopLatchBB + // unreachable + // EqualBB: ; No preds, UNREACHABLE! + // br label %Successor1BB + // UnequalBB: ; No preds, UNREACHABLE! + // br label %Successor0BB + // Successor0BB: ; preds = %UnequalBB + // %S0PHI = phi <...> [ <...>, %UnequalBB ] + // <...> + // Successor1BB: ; preds = %EqualBB + // %S0PHI = phi <...> [ <...>, %EqualBB ] + // <...> + + // *Finally*, zap the original loop. Record it's parent loop though. + Loop *ParentLoop = CurLoop->getParentLoop(); + LLVM_DEBUG(dbgs() << "Deleting old loop.\n"); + LoopDeleter.markLoopAsDeleted(CurLoop); // Mark as deleted *BEFORE* deleting! + deleteDeadLoop(CurLoop, DT, SE, LI); // And actually delete the loop. + CurLoop = nullptr; + + // By now we have: (3/6) + // PreheaderBB: ; preds = ??? + // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // [no terminator instruction!] + // PhonyPreheaderBB: ; No preds, UNREACHABLE! + // br label %PhonySuccessorBB + // PhonySuccessorBB: ; preds = %PhonyPreheaderBB + // unreachable + // EqualBB: ; No preds, UNREACHABLE! + // br label %Successor1BB + // UnequalBB: ; No preds, UNREACHABLE! + // br label %Successor0BB + // Successor0BB: ; preds = %UnequalBB + // %S0PHI = phi <...> [ <...>, %UnequalBB ] + // <...> + // Successor1BB: ; preds = %EqualBB + // %S0PHI = phi <...> [ <...>, %EqualBB ] + // <...> + + // Now, actually restore the CFG. + + // Insert an unconditional branch from an actual preheader basic block to + // phony preheader basic block. + IRBuilder<>(PreheaderBB).CreateBr(PhonyPreheaderBB); + DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB}); + // Insert proper conditional branch from phony successor basic block to the + // "dispatch" basic blocks, which were used to preserve incoming values in + // original loop's successor basic blocks. 
+ assert(isa<UnreachableInst>(PhonySuccessorBB->getTerminator()) && + "Yep, that's the one we created to keep deleteDeadLoop() happy."); + PhonySuccessorBB->getTerminator()->eraseFromParent(); + { + IRBuilder<> Builder(PhonySuccessorBB); + Builder.SetCurrentDebugLocation(ComparedEqual->getDebugLoc()); + Builder.CreateCondBr(ComparedEqual, ComparedEqualBB, ComparedUnequalBB); + } + DTUpdates.push_back( + {DominatorTree::Insert, PhonySuccessorBB, ComparedEqualBB}); + DTUpdates.push_back( + {DominatorTree::Insert, PhonySuccessorBB, ComparedUnequalBB}); + + BasicBlock *DispatchBB = PhonySuccessorBB; + DispatchBB->setName(LoopName + ".bcmpdispatchbb"); + + assert(DTUpdates.size() == 3 && "Update count prediction failed."); + DTU.applyUpdates(DTUpdates); + DTUpdates.clear(); + + // By now we have: (4/6) + // PreheaderBB: ; preds = ??? + // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // br label %PhonyPreheaderBB + // PhonyPreheaderBB: ; preds = %PreheaderBB + // br label %DispatchBB + // DispatchBB: ; preds = %PhonyPreheaderBB + // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB + // EqualBB: ; preds = %DispatchBB + // br label %Successor1BB + // UnequalBB: ; preds = %DispatchBB + // br label %Successor0BB + // Successor0BB: ; preds = %UnequalBB + // %S0PHI = phi <...> [ <...>, %UnequalBB ] + // <...> + // Successor1BB: ; preds = %EqualBB + // %S0PHI = phi <...> [ <...>, %EqualBB ] + // <...> + + // The basic CFG has been restored! Now let's merge redundant basic blocks. + + // Merge phony successor basic block into it's only predecessor, + // phony preheader basic block. It is fully pointlessly redundant. + MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU); + + // By now we have: (5/6) + // PreheaderBB: ; preds = ??? + // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // br label %DispatchBB + // DispatchBB: ; preds = %PreheaderBB + // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB + // EqualBB: ; preds = %DispatchBB + // br label %Successor1BB + // UnequalBB: ; preds = %DispatchBB + // br label %Successor0BB + // Successor0BB: ; preds = %UnequalBB + // %S0PHI = phi <...> [ <...>, %UnequalBB ] + // <...> + // Successor1BB: ; preds = %EqualBB + // %S0PHI = phi <...> [ <...>, %EqualBB ] + // <...> + + // Was this loop nested? + if (!ParentLoop) { + // If the loop was *NOT* nested, then let's also merge phony successor + // basic block into it's only predecessor, preheader basic block. + // Also, here we need to update LoopInfo. + LI->removeBlock(PreheaderBB); + MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU); + + // By now we have: (6/6) + // DispatchBB: ; preds = ??? + // <...> + // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) + // %ComparedEqual = icmp eq <...> %memcmp, 0 + // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB + // EqualBB: ; preds = %DispatchBB + // br label %Successor1BB + // UnequalBB: ; preds = %DispatchBB + // br label %Successor0BB + // Successor0BB: ; preds = %UnequalBB + // %S0PHI = phi <...> [ <...>, %UnequalBB ] + // <...> + // Successor1BB: ; preds = %EqualBB + // %S0PHI = phi <...> [ <...>, %EqualBB ] + // <...> + + return DispatchBB; + } + + // Otherwise, we need to "preserve" the LoopSimplify form of the deleted loop. 
+ // To achieve that, we shall keep the preheader basic block (mainly so that
+ // the loop header block will be guaranteed to have a predecessor outside of
+ // the loop), and create a phony loop with all three of these new basic
+ // blocks.
+ Loop *PhonyLoop = LI->AllocateLoop();
+ ParentLoop->addChildLoop(PhonyLoop);
+ PhonyLoop->addBasicBlockToLoop(DispatchBB, *LI);
+ PhonyLoop->addBasicBlockToLoop(ComparedEqualBB, *LI);
+ PhonyLoop->addBasicBlockToLoop(ComparedUnequalBB, *LI);
+
+ // But we only have a preheader basic block, a header basic block and
+ // two exiting basic blocks. For a proper loop we also need a backedge from
+ // non-header basic block to header bb.
+ // Let's just add a never-taken branch from both of the exiting basic blocks.
+ for (BasicBlock *BB : {ComparedEqualBB, ComparedUnequalBB}) {
+ BranchInst *OldTerminator = cast<BranchInst>(BB->getTerminator());
+ assert(OldTerminator->isUnconditional() && "That's the one we created.");
+ BasicBlock *SuccessorBB = OldTerminator->getSuccessor(0);
+
+ IRBuilder<> Builder(OldTerminator);
+ Builder.SetCurrentDebugLocation(OldTerminator->getDebugLoc());
+ Builder.CreateCondBr(ConstantInt::getTrue(Context), SuccessorBB,
+ DispatchBB);
+ OldTerminator->eraseFromParent();
+ // Yes, the backedge will never be taken. The control-flow is redundant.
+ // If it can be simplified further, other passes will take care.
+ DTUpdates.push_back({DominatorTree::Delete, BB, SuccessorBB});
+ DTUpdates.push_back({DominatorTree::Insert, BB, SuccessorBB});
+ DTUpdates.push_back({DominatorTree::Insert, BB, DispatchBB});
+ }
+ assert(DTUpdates.size() == 6 && "Update count prediction failed.");
+ DTU.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+
+ // By now we have: (6/6)
+ // PreheaderBB: <preheader> ; preds = ???
+ // <...>
+ // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+ // %ComparedEqual = icmp eq <...> %memcmp, 0
+ // br label %BCmpDispatchBB
+ // BCmpDispatchBB: <header> ; preds = %PreheaderBB
+ // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
+ // EqualBB: <latch,exiting> ; preds = %BCmpDispatchBB
+ // br i1 %true, label %Successor1BB, label %BCmpDispatchBB
+ // UnequalBB: <latch,exiting> ; preds = %BCmpDispatchBB
+ // br i1 %true, label %Successor0BB, label %BCmpDispatchBB
+ // Successor0BB: ; preds = %UnequalBB
+ // %S0PHI = phi <...> [ <...>, %UnequalBB ]
+ // <...>
+ // Successor1BB: ; preds = %EqualBB
+ // %S0PHI = phi <...> [ <...>, %EqualBB ]
+ // <...>
+
+ // Finally fully DONE!
+ return DispatchBB;
+}
+
+void LoopIdiomRecognize::transformLoopToBCmp(ICmpInst *BCmpInst,
+ CmpInst *LatchCmpInst,
+ LoadInst *LoadA, LoadInst *LoadB,
+ const SCEV *SrcA, const SCEV *SrcB,
+ const SCEV *NBytes) {
+ // We will be inserting before the terminator instruction of preheader block.
+ IRBuilder<> Builder(CurLoop->getLoopPreheader()->getTerminator());
+
+ LLVM_DEBUG(dbgs() << "Transforming bcmp loop idiom into a call.\n");
+ LLVM_DEBUG(dbgs() << "Emitting new instructions.\n");
+
+ // Expand the SCEV expressions for both sources to compare, and produce value
+ // for the byte len (beware of Iterations potentially being a pointer, and
+ // account for element size being BCmpTyBytes bytes, which may not be 1 byte)
+ Value *PtrA, *PtrB, *Len;
+ {
+ SCEVExpander SExp(*SE, *DL, "LoopToBCmp");
+ SExp.setInsertPoint(&*Builder.GetInsertPoint());
+
+ auto HandlePtr = [&SExp](LoadInst *Load, const SCEV *Src) {
+ SExp.SetCurrentDebugLocation(DebugLoc());
+ // If the pointer operand of original load had dbgloc - use it.
+ if (const auto *I = dyn_cast<Instruction>(Load->getPointerOperand())) + SExp.SetCurrentDebugLocation(I->getDebugLoc()); + return SExp.expandCodeFor(Src); + }; + PtrA = HandlePtr(LoadA, SrcA); + PtrB = HandlePtr(LoadB, SrcB); + + // For len calculation let's use dbgloc for the loop's latch condition. + Builder.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc()); + SExp.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc()); + Len = SExp.expandCodeFor(NBytes); + + Type *CmpFuncSizeTy = DL->getIntPtrType(Builder.getContext()); + assert(SE->getTypeSizeInBits(Len->getType()) == + DL->getTypeSizeInBits(CmpFuncSizeTy) && + "Len should already have the correct size."); + + // Make sure that iteration count is a number, insert ptrtoint cast if not. + if (Len->getType()->isPointerTy()) + Len = Builder.CreatePtrToInt(Len, CmpFuncSizeTy); + assert(Len->getType() == CmpFuncSizeTy && "Should have correct type now."); + + Len->setName(Len->getName() + ".bytecount"); + + // There is no legality check needed. We want to compare that the memory + // regions [PtrA, PtrA+Len) and [PtrB, PtrB+Len) are fully identical, equal. + // For them to be fully equal, they must match bit-by-bit. And likewise, + // for them to *NOT* be fully equal, they have to differ just by one bit. + // The step of comparison (bits compared at once) simply does not matter. + } + + // For the rest of new instructions, dbgloc should point at the value cmp. + Builder.SetCurrentDebugLocation(BCmpInst->getDebugLoc()); + + // Emit the comparison itself. + auto *CmpCall = + cast<CallInst>(HasBCmp ? emitBCmp(PtrA, PtrB, Len, Builder, *DL, TLI) + : emitMemCmp(PtrA, PtrB, Len, Builder, *DL, TLI)); + // FIXME: add {B,Mem}CmpInst with MemoryCompareInst + // (based on MemIntrinsicBase) as base? + // FIXME: propagate metadata from loads? (alignments, AS, TBAA, ...) + + // {b,mem}cmp returned 0 if they were equal, or non-zero if not equal. + auto *ComparedEqual = cast<ICmpInst>(Builder.CreateICmpEQ( + CmpCall, ConstantInt::get(CmpCall->getType(), 0), + PtrA->getName() + ".vs." + PtrB->getName() + ".eqcmp")); + + BasicBlock *BB = transformBCmpControlFlow(ComparedEqual); + Builder.ClearInsertionPoint(); + + // We're done. + LLVM_DEBUG(dbgs() << "Transformed loop bcmp idiom into a call.\n"); + ORE.emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "TransformedBCmpIdiomToCall", + CmpCall->getDebugLoc(), BB) + << "Transformed bcmp idiom into a call to " + << ore::NV("NewFunction", CmpCall->getCalledFunction()) + << "() function"; + }); + ++NumBCmp; +} + +/// Recognizes a bcmp idiom in a non-countable loop. +/// +/// If detected, transforms the relevant code to issue the bcmp (or memcmp) +/// intrinsic function call, and returns true; otherwise, returns false. 
+bool LoopIdiomRecognize::recognizeBCmp() { + if (!HasMemCmp && !HasBCmp) + return false; + + ICmpInst *BCmpInst; + CmpInst *LatchCmpInst; + LoadInst *LoadA, *LoadB; + const SCEV *SrcA, *SrcB, *NBytes; + if (!detectBCmpIdiom(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB, + NBytes)) { + LLVM_DEBUG(dbgs() << "bcmp idiom recognition failed.\n"); + return false; + } + + transformLoopToBCmp(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB, NBytes); + return true; +} diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index 31191b52895c..368b9d4e8df1 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -192,7 +192,8 @@ public: getAnalysis<AssumptionCacheTracker>().getAssumptionCache( *L->getHeader()->getParent()); const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()); MemorySSA *MSSA = nullptr; Optional<MemorySSAUpdater> MSSAU; if (EnableMSSALoopDependency) { @@ -233,7 +234,7 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, auto PA = getLoopPassPreservedAnalyses(); PA.preserveSet<CFGAnalyses>(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; } diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp index 9a42365adc1b..1af4b21b432e 100644 --- a/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/lib/Transforms/Scalar/LoopInterchange.cpp @@ -410,8 +410,6 @@ public: void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); private: - void splitInnerLoopLatch(Instruction *); - void splitInnerLoopHeader(); bool adjustLoopLinks(); void adjustLoopPreheaders(); bool adjustLoopBranches(); @@ -1226,7 +1224,7 @@ bool LoopInterchangeTransform::transform() { if (InnerLoop->getSubLoops().empty()) { BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); - LLVM_DEBUG(dbgs() << "Calling Split Inner Loop\n"); + LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n"); PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); if (!InductionPHI) { LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); @@ -1242,11 +1240,55 @@ bool LoopInterchangeTransform::transform() { if (&InductionPHI->getParent()->front() != InductionPHI) InductionPHI->moveBefore(&InductionPHI->getParent()->front()); - // Split at the place were the induction variable is - // incremented/decremented. - // TODO: This splitting logic may not work always. Fix this. - splitInnerLoopLatch(InnerIndexVar); - LLVM_DEBUG(dbgs() << "splitInnerLoopLatch done\n"); + // Create a new latch block for the inner loop. We split at the + // current latch's terminator and then move the condition and all + // operands that are not either loop-invariant or the induction PHI into the + // new latch block. + BasicBlock *NewLatch = + SplitBlock(InnerLoop->getLoopLatch(), + InnerLoop->getLoopLatch()->getTerminator(), DT, LI); + + SmallSetVector<Instruction *, 4> WorkList; + unsigned i = 0; + auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() { + for (; i < WorkList.size(); i++) { + // Duplicate instruction and move it the new latch. Update uses that + // have been moved. 
+ Instruction *NewI = WorkList[i]->clone(); + NewI->insertBefore(NewLatch->getFirstNonPHI()); + assert(!NewI->mayHaveSideEffects() && + "Moving instructions with side-effects may change behavior of " + "the loop nest!"); + for (auto UI = WorkList[i]->use_begin(), UE = WorkList[i]->use_end(); + UI != UE;) { + Use &U = *UI++; + Instruction *UserI = cast<Instruction>(U.getUser()); + if (!InnerLoop->contains(UserI->getParent()) || + UserI->getParent() == NewLatch || UserI == InductionPHI) + U.set(NewI); + } + // Add operands of moved instruction to the worklist, except if they are + // outside the inner loop or are the induction PHI. + for (Value *Op : WorkList[i]->operands()) { + Instruction *OpI = dyn_cast<Instruction>(Op); + if (!OpI || + this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop || + OpI == InductionPHI) + continue; + WorkList.insert(OpI); + } + } + }; + + // FIXME: Should we interchange when we have a constant condition? + Instruction *CondI = dyn_cast<Instruction>( + cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator()) + ->getCondition()); + if (CondI) + WorkList.insert(CondI); + MoveInstructions(); + WorkList.insert(cast<Instruction>(InnerIndexVar)); + MoveInstructions(); // Splits the inner loops phi nodes out into a separate basic block. BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); @@ -1263,10 +1305,6 @@ bool LoopInterchangeTransform::transform() { return true; } -void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) { - SplitBlock(InnerLoop->getLoopLatch(), Inc, DT, LI); -} - /// \brief Move all instructions except the terminator from FromBB right before /// InsertBefore static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp index 2b3d5e0ce9b7..e8dc879a184b 100644 --- a/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -435,7 +435,8 @@ public: PH->getTerminator()); Value *Initial = new LoadInst( Cand.Load->getType(), InitialPtr, "load_initial", - /* isVolatile */ false, Cand.Load->getAlignment(), PH->getTerminator()); + /* isVolatile */ false, MaybeAlign(Cand.Load->getAlignment()), + PH->getTerminator()); PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded", &L->getHeader()->front()); diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp index 507a1e251ca6..885c0e8f4b8b 100644 --- a/lib/Transforms/Scalar/LoopPredication.cpp +++ b/lib/Transforms/Scalar/LoopPredication.cpp @@ -543,7 +543,7 @@ bool LoopPredication::isLoopInvariantValue(const SCEV* S) { if (const auto *LI = dyn_cast<LoadInst>(U->getValue())) if (LI->isUnordered() && L->hasLoopInvariantOperands(LI)) if (AA->pointsToConstantMemory(LI->getOperand(0)) || - LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr) + LI->hasMetadata(LLVMContext::MD_invariant_load)) return true; return false; } diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp index 166b57f20b43..96e2c2a3ac6b 100644 --- a/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -1644,7 +1644,8 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TLI = 
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index e009947690af..94517996df39 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -55,7 +55,7 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, AR.MSSA->verifyMemorySSA(); auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; } @@ -94,17 +94,15 @@ public: auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); - auto *SE = SEWP ? &SEWP->getSE() : nullptr; + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const SimplifyQuery SQ = getBestSimplifyQuery(*this, F); Optional<MemorySSAUpdater> MSSAU; if (EnableMSSALoopDependency) { MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); MSSAU = MemorySSAUpdater(MSSA); } - return LoopRotation(L, LI, TTI, AC, DT, SE, + return LoopRotation(L, LI, TTI, AC, &DT, &SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ, false, MaxHeaderSize, false); } diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 046f4c8af492..299f3fc5fb19 100644 --- a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -690,7 +690,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &LPMU) { Optional<MemorySSAUpdater> MSSAU; - if (EnableMSSALoopDependency && AR.MSSA) + if (AR.MSSA) MSSAU = MemorySSAUpdater(AR.MSSA); bool DeleteCurrentLoop = false; if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE, @@ -702,7 +702,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, LPMU.markLoopAsDeleted(L, "loop-simplifycfg"); auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; } diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp index 975452e13f09..65e0dee0225a 100644 --- a/lib/Transforms/Scalar/LoopSink.cpp +++ b/lib/Transforms/Scalar/LoopSink.cpp @@ -230,12 +230,9 @@ static bool sinkInstruction(Loop &L, Instruction &I, IC->setName(I.getName()); IC->insertBefore(&*N->getFirstInsertionPt()); // Replaces uses of I with IC in N - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;) { - Use &U = *UI++; - auto *I = cast<Instruction>(U.getUser()); - if (I->getParent() == N) - U.set(IC); - } + I.replaceUsesWithIf(IC, [N](Use &U) { + return cast<Instruction>(U.getUser())->getParent() == N; + }); // Replaces uses of I with IC in blocks dominated by N replaceDominatedUsesWith(&I, IC, DT, N); LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName() diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 59a387a186b8..7f119175c4a8 100644 --- 
a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1386,7 +1386,9 @@ void Cost::RateFormula(const Formula &F,
   // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
   // additional instruction (at least fill).
-  unsigned TTIRegNum = TTI->getNumberOfRegisters(false) - 1;
+  // TODO: Need to distinguish register class?
+  unsigned TTIRegNum = TTI->getNumberOfRegisters(
+      TTI->getRegisterClassForType(false, F.getType())) - 1;
   if (C.NumRegs > TTIRegNum) {
     // Cost already exceeded TTIRegNum, then only newly added register can add
     // new instructions.
@@ -3165,6 +3167,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
     LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
     return;
   }
+  assert(IVSrc && "Failed to find IV chain source");
   LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
   Type *IVTy = IVSrc->getType();
@@ -3265,12 +3268,12 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
       // requirements for both N and i at the same time. Limiting this code to
       // equality icmps is not a problem because all interesting loops use
       // equality icmps, thanks to IndVarSimplify.
-      if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
+      if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
+        // If CI can be saved in some target, like replaced inside hardware loop
+        // in PowerPC, no need to generate initial formulae for it.
+        if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
+          continue;
         if (CI->isEquality()) {
-          // If CI can be saved in some target, like replaced inside hardware loop
-          // in PowerPC, no need to generate initial formulae for it.
-          if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
-            continue;
           // Swap the operands if needed to put the OperandValToReplace on the
           // left, for consistency.
           Value *NV = CI->getOperand(1);
@@ -3298,6 +3301,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
           Factors.insert(-(uint64_t)Factors[i]);
         Factors.insert(-1);
       }
+      }
       // Get or create an LSRUse.
       std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
@@ -4834,6 +4838,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
         }
       }
     }
+    assert(Best && "Failed to find best LSRUse candidate");
     LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
                       << " will yield profitable reuse.\n");
@@ -5740,7 +5745,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
       *L->getHeader()->getParent());
   auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
       *L->getHeader()->getParent());
-  auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+      *L->getHeader()->getParent());
   return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, LibInfo);
 }
diff --git a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 86891eb451bb..8d88be420314 100644
--- a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -166,7 +166,7 @@ static bool computeUnrollAndJamCount(
   bool UseUpperBound = false;
   bool ExplicitUnroll = computeUnrollCount(
       L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
-      OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
+      /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
   if (ExplicitUnroll || UseUpperBound) {
     // If the user explicitly set the loop as unrolled, don't UnJ it.
Leave it // for the unroller instead. @@ -293,9 +293,9 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, if (Latch != Exit || SubLoopLatch != SubLoopExit) return LoopUnrollResult::Unmodified; - TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, SE, TTI, nullptr, nullptr, OptLevel, - None, None, None, None, None, None); + TargetTransformInfo::UnrollingPreferences UP = + gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None, + None, None, None, None, None, None, None); if (AllowUnrollAndJam.getNumOccurrences() > 0) UP.UnrollAndJam = AllowUnrollAndJam; if (UnrollAndJamThreshold.getNumOccurrences() > 0) diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2fa7436213dd..a6d4164c3645 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -178,7 +178,9 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel, Optional<unsigned> UserThreshold, Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, Optional<bool> UserRuntime, - Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) { + Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling, + Optional<bool> UserAllowProfileBasedPeeling, + Optional<unsigned> UserFullUnrollMaxCount) { TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults @@ -202,6 +204,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.UpperBound = false; UP.AllowPeeling = true; UP.UnrollAndJam = false; + UP.PeelProfiledIterations = true; UP.UnrollAndJamInnerLoopThreshold = 60; // Override with any target specific settings @@ -257,6 +260,10 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.UpperBound = *UserUpperBound; if (UserAllowPeeling.hasValue()) UP.AllowPeeling = *UserAllowPeeling; + if (UserAllowProfileBasedPeeling.hasValue()) + UP.PeelProfiledIterations = *UserAllowProfileBasedPeeling; + if (UserFullUnrollMaxCount.hasValue()) + UP.FullUnrollMaxCount = *UserFullUnrollMaxCount; return UP; } @@ -730,7 +737,7 @@ bool llvm::computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues, OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, - unsigned &TripMultiple, unsigned LoopSize, + bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. @@ -781,18 +788,34 @@ bool llvm::computeUnrollCount( // Also we need to check if we exceed FullUnrollMaxCount. // If using the upper bound to unroll, TripMultiple should be set to 1 because // we do not know when loop may exit. - // MaxTripCount and ExactTripCount cannot both be non zero since we only + + // We can unroll by the upper bound amount if it's generally allowed or if + // we know that the loop is executed either the upper bound or zero times. + // (MaxOrZero unrolling keeps only the first loop test, so the number of + // loop tests remains the same compared to the non-unrolled version, whereas + // the generic upper bound unrolling keeps all but the last loop test so the + // number of loop tests goes up which may end up being worse on targets with + // constrained branch predictor resources so is controlled by an option.) 
+  // In addition we only unroll small upper bounds.
+  unsigned FullUnrollMaxTripCount = MaxTripCount;
+  if (!(UP.UpperBound || MaxOrZero) ||
+      FullUnrollMaxTripCount > UnrollMaxUpperBound)
+    FullUnrollMaxTripCount = 0;
+
+  // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
   // compute the former when the latter is zero.
   unsigned ExactTripCount = TripCount;
-  assert((ExactTripCount == 0 || MaxTripCount == 0) &&
-         "ExtractTripCount and MaxTripCount cannot both be non zero.");
-  unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
+  assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
+         "ExactTripCount and UnrollByMaxCount cannot both be non zero.");
+
+  unsigned FullUnrollTripCount =
+      ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
   UP.Count = FullUnrollTripCount;
   if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
     // When computing the unrolled size, note that BEInsns are not replicated
     // like the rest of the loop body.
     if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
-      UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+      UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
       TripCount = FullUnrollTripCount;
       TripMultiple = UP.UpperBound ? 1 : TripMultiple;
       return ExplicitUnroll;
@@ -806,7 +829,7 @@ bool llvm::computeUnrollCount(
       unsigned Boost =
           getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
       if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
-        UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+        UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
         TripCount = FullUnrollTripCount;
         TripMultiple = UP.UpperBound ? 1 : TripMultiple;
         return ExplicitUnroll;
@@ -882,6 +905,8 @@ bool llvm::computeUnrollCount(
                "because "
                "unrolled size is too large.";
       });
+    LLVM_DEBUG(dbgs() << "  partially unrolling with count: " << UP.Count
+                      << "\n");
     return ExplicitUnroll;
   }
   assert(TripCount == 0 &&
@@ -903,6 +928,12 @@ bool llvm::computeUnrollCount(
     return false;
   }
+  // Don't unroll a small upper bound loop unless user or TTI asked to do so.
+  if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
+    UP.Count = 0;
+    return false;
+  }
+
   // Check if the runtime trip count is too small when profile is available.
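// Illustrative aside, not part of this commit: the check below is
// profile-driven. If branch-weight metadata suggests only a couple of
// iterations per loop entry, runtime unrolling mostly adds prologue and
// epilogue code without amortizing the backedge test, so the pass may
// decline to unroll. The concrete threshold comparison follows; the
// iteration numbers above are hypothetical.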
if (L->getHeader()->getParent()->hasProfileData()) { if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) { @@ -966,7 +997,11 @@ bool llvm::computeUnrollCount( if (UP.Count > UP.MaxCount) UP.Count = UP.MaxCount; - LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count + + if (MaxTripCount && UP.Count > MaxTripCount) + UP.Count = MaxTripCount; + + LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count << "\n"); if (UP.Count < 2) UP.Count = 0; @@ -976,13 +1011,14 @@ bool llvm::computeUnrollCount( static LoopUnrollResult tryToUnrollLoop( Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, - OptimizationRemarkEmitter &ORE, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, - bool PreserveLCSSA, int OptLevel, + OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel, bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial, Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound, - Optional<bool> ProvidedAllowPeeling) { + Optional<bool> ProvidedAllowPeeling, + Optional<bool> ProvidedAllowProfileBasedPeeling, + Optional<unsigned> ProvidedFullUnrollMaxCount) { LLVM_DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() << "] Loop %" << L->getHeader()->getName() << "\n"); @@ -1007,7 +1043,8 @@ static LoopUnrollResult tryToUnrollLoop( TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, - ProvidedAllowPeeling); + ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, + ProvidedFullUnrollMaxCount); // Exit early if unrolling is disabled. For OptForSize, we pick the loop size // as threshold later on. @@ -1028,10 +1065,10 @@ static LoopUnrollResult tryToUnrollLoop( return LoopUnrollResult::Unmodified; } - // When optimizing for size, use LoopSize as threshold, to (fully) unroll - // loops, if it does not increase code size. + // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold + // later), to (fully) unroll loops, if it does not increase code size. if (OptForSize) - UP.Threshold = std::max(UP.Threshold, LoopSize); + UP.Threshold = std::max(UP.Threshold, LoopSize + 1); if (NumInlineCandidates != 0) { LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); @@ -1040,7 +1077,6 @@ static LoopUnrollResult tryToUnrollLoop( // Find trip count and trip multiple if count is not available unsigned TripCount = 0; - unsigned MaxTripCount = 0; unsigned TripMultiple = 1; // If there are multiple exiting blocks but one of them is the latch, use the // latch for the trip count estimation. Otherwise insist on a single exiting @@ -1070,28 +1106,18 @@ static LoopUnrollResult tryToUnrollLoop( // Try to find the trip count upper bound if we cannot find the exact trip // count. + unsigned MaxTripCount = 0; bool MaxOrZero = false; if (!TripCount) { MaxTripCount = SE.getSmallConstantMaxTripCount(L); MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L); - // We can unroll by the upper bound amount if it's generally allowed or if - // we know that the loop is executed either the upper bound or zero times. 
- // (MaxOrZero unrolling keeps only the first loop test, so the number of - // loop tests remains the same compared to the non-unrolled version, whereas - // the generic upper bound unrolling keeps all but the last loop test so the - // number of loop tests goes up which may end up being worse on targets with - // constrained branch predictor resources so is controlled by an option.) - // In addition we only unroll small upper bounds. - if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) { - MaxTripCount = 0; - } } // computeUnrollCount() decides whether it is beneficial to use upper bound to // fully unroll the loop. bool UseUpperBound = false; bool IsCountSetExplicitly = computeUnrollCount( - L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, + L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero, TripMultiple, LoopSize, UP, UseUpperBound); if (!UP.Count) return LoopUnrollResult::Unmodified; @@ -1139,7 +1165,7 @@ static LoopUnrollResult tryToUnrollLoop( // If the loop was peeled, we already "used up" the profile information // we had, so we don't want to unroll or peel again. if (UnrollResult != LoopUnrollResult::FullyUnrolled && - (IsCountSetExplicitly || UP.PeelCount)) + (IsCountSetExplicitly || (UP.PeelProfiledIterations && UP.PeelCount))) L->setLoopAlreadyUnrolled(); return UnrollResult; @@ -1169,18 +1195,24 @@ public: Optional<bool> ProvidedRuntime; Optional<bool> ProvidedUpperBound; Optional<bool> ProvidedAllowPeeling; + Optional<bool> ProvidedAllowProfileBasedPeeling; + Optional<unsigned> ProvidedFullUnrollMaxCount; LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false, bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None, Optional<unsigned> Count = None, Optional<bool> AllowPartial = None, Optional<bool> Runtime = None, Optional<bool> UpperBound = None, - Optional<bool> AllowPeeling = None) + Optional<bool> AllowPeeling = None, + Optional<bool> AllowProfileBasedPeeling = None, + Optional<unsigned> ProvidedFullUnrollMaxCount = None) : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced), ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)), ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound), - ProvidedAllowPeeling(AllowPeeling) { + ProvidedAllowPeeling(AllowPeeling), + ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling), + ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) { initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -1203,10 +1235,11 @@ public: bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); LoopUnrollResult Result = tryToUnrollLoop( - L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, - PreserveLCSSA, OptLevel, OnlyWhenForced, - ForgetAllSCEV, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, - ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling); + L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel, + OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold, + ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, + ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, + ProvidedFullUnrollMaxCount); if (Result == LoopUnrollResult::FullyUnrolled) LPM.markLoopAsDeleted(*L); @@ -1283,14 +1316,16 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, std::string LoopName = L.getName(); - bool Changed = - tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, - /*BFI*/ nullptr, /*PSI*/ nullptr, - /*PreserveLCSSA*/ 
true, OptLevel, OnlyWhenForced, - ForgetSCEV, /*Count*/ None, - /*Threshold*/ None, /*AllowPartial*/ false, - /*Runtime*/ false, /*UpperBound*/ false, - /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified; + bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, + /*BFI*/ nullptr, /*PSI*/ nullptr, + /*PreserveLCSSA*/ true, OptLevel, + OnlyWhenForced, ForgetSCEV, /*Count*/ None, + /*Threshold*/ None, /*AllowPartial*/ false, + /*Runtime*/ false, /*UpperBound*/ false, + /*AllowPeeling*/ false, + /*AllowProfileBasedPeeling*/ false, + /*FullUnrollMaxCount*/ None) != + LoopUnrollResult::Unmodified; if (!Changed) return PreservedAnalyses::all(); @@ -1430,7 +1465,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced, UnrollOpts.ForgetSCEV, /*Count*/ None, /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime, - UnrollOpts.AllowUpperBound, LocalAllowPeeling); + UnrollOpts.AllowUpperBound, LocalAllowPeeling, + UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount); Changed |= Result != LoopUnrollResult::Unmodified; // The parent must not be damaged by unrolling! diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index b5b8e720069c..b410df0c5f68 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -420,7 +420,8 @@ enum OperatorChain { /// cost of creating an entirely new loop. static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, OperatorChain &ParentChain, - DenseMap<Value *, Value *> &Cache) { + DenseMap<Value *, Value *> &Cache, + MemorySSAUpdater *MSSAU) { auto CacheIt = Cache.find(Cond); if (CacheIt != Cache.end()) return CacheIt->second; @@ -438,7 +439,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, // TODO: Handle: br (VARIANT|INVARIANT). // Hoist simple values out. - if (L->makeLoopInvariant(Cond, Changed)) { + if (L->makeLoopInvariant(Cond, Changed, nullptr, MSSAU)) { Cache[Cond] = Cond; return Cond; } @@ -478,7 +479,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, // which will cause the branch to go away in one loop and the condition to // simplify in the other one. if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed, - ParentChain, Cache)) { + ParentChain, Cache, MSSAU)) { Cache[Cond] = LHS; return LHS; } @@ -486,7 +487,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, // operand(1). ParentChain = NewChain; if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed, - ParentChain, Cache)) { + ParentChain, Cache, MSSAU)) { Cache[Cond] = RHS; return RHS; } @@ -500,12 +501,12 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, /// Cond is a condition that occurs in L. If it is invariant in the loop, or has /// an invariant piece, return the invariant along with the operator chain type. /// Otherwise, return null. 
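// Illustrative aside, not part of this commit: an "invariant piece" arises
// from a condition such as
//   br i1 (and i1 %inv, %var), ...
// where %inv is loop-invariant and %var is not; walking the operator chain
// can still surface %inv as an unswitching candidate. The IR is
// hypothetical; the substantive change below threads a MemorySSAUpdater
// through these helpers.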
-static std::pair<Value *, OperatorChain> FindLIVLoopCondition(Value *Cond, - Loop *L, - bool &Changed) { +static std::pair<Value *, OperatorChain> +FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, + MemorySSAUpdater *MSSAU) { DenseMap<Value *, Value *> Cache; OperatorChain OpChain = OC_OpChainNone; - Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache); + Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU); // In case we do find a LIV, it can not be obtained by walking up a mixed // operator chain. @@ -525,7 +526,7 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); if (EnableMSSALoopDependency) { MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); - MSSAU = make_unique<MemorySSAUpdater>(MSSA); + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); assert(DT && "Cannot update MemorySSA without a valid DomTree."); } currentLoop = L; @@ -694,8 +695,9 @@ bool LoopUnswitch::processCurrentLoop() { } for (IntrinsicInst *Guard : Guards) { - Value *LoopCond = - FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(Guard->getOperand(0), currentLoop, + Changed, MSSAU.get()) + .first; if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { // NB! Unswitching (if successful) could have erased some of the @@ -735,8 +737,9 @@ bool LoopUnswitch::processCurrentLoop() { if (BI->isConditional()) { // See if this, or some part of it, is loop invariant. If so, we can // unswitch on it if we desire. - Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; if (LoopCond && !EqualityPropUnSafe(*LoopCond) && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) { ++NumBranches; @@ -748,7 +751,7 @@ bool LoopUnswitch::processCurrentLoop() { Value *LoopCond; OperatorChain OpChain; std::tie(LoopCond, OpChain) = - FindLIVLoopCondition(SC, currentLoop, Changed); + FindLIVLoopCondition(SC, currentLoop, Changed, MSSAU.get()); unsigned NumCases = SI->getNumCases(); if (LoopCond && NumCases) { @@ -808,8 +811,9 @@ bool LoopUnswitch::processCurrentLoop() { for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); BBI != E; ++BBI) if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) { - Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { ++NumSelects; @@ -1123,8 +1127,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { if (!BI->isConditional()) return false; - Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; // Unswitch only if the trivial condition itself is an LIV (not // partial LIV which could occur in and/or) @@ -1157,8 +1162,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) { // If this isn't switching on an invariant condition, we can't unswitch it. 
- Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; // Unswitch only if the trivial condition itself is an LIV (not // partial LIV which could occur in and/or) @@ -1240,6 +1246,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, LoopBlocks.clear(); NewBlocks.clear(); + if (MSSAU && VerifyMemorySSA) + MSSA->verifyMemorySSA(); + // First step, split the preheader and exit blocks, and add these blocks to // the LoopBlocks list. BasicBlock *NewPreheader = @@ -1607,36 +1616,30 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // If BI's parent is the only pred of the successor, fold the two blocks // together. BasicBlock *Pred = BI->getParent(); + (void)Pred; BasicBlock *Succ = BI->getSuccessor(0); BasicBlock *SinglePred = Succ->getSinglePredecessor(); if (!SinglePred) continue; // Nothing to do. assert(SinglePred == Pred && "CFG broken"); - LLVM_DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " - << Succ->getName() << "\n"); - - // Resolve any single entry PHI nodes in Succ. - while (PHINode *PN = dyn_cast<PHINode>(Succ->begin())) - ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM, - MSSAU.get()); - - // If Succ has any successors with PHI nodes, update them to have - // entries coming from Pred instead of Succ. - Succ->replaceAllUsesWith(Pred); - - // Move all of the successor contents from Succ to Pred. - Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(), - Succ->begin(), Succ->end()); - if (MSSAU) - MSSAU->moveAllAfterMergeBlocks(Succ, Pred, BI); + // Make the LPM and Worklist updates specific to LoopUnswitch. LPM->deleteSimpleAnalysisValue(BI, L); RemoveFromWorklist(BI, Worklist); - BI->eraseFromParent(); - - // Remove Succ from the loop tree. - LI->removeBlock(Succ); LPM->deleteSimpleAnalysisValue(Succ, L); - Succ->eraseFromParent(); + auto SuccIt = Succ->begin(); + while (PHINode *PN = dyn_cast<PHINode>(SuccIt++)) { + for (unsigned It = 0, E = PN->getNumOperands(); It != E; ++It) + if (Instruction *Use = dyn_cast<Instruction>(PN->getOperand(It))) + Worklist.push_back(Use); + for (User *U : PN->users()) + Worklist.push_back(cast<Instruction>(U)); + LPM->deleteSimpleAnalysisValue(PN, L); + RemoveFromWorklist(PN, Worklist); + ++NumSimplify; + } + // Merge the block and make the remaining analyses updates. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + MergeBlockIntoPredecessor(Succ, &DTU, LI, MSSAU.get()); ++NumSimplify; continue; } diff --git a/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 896dd8bcb922..2ccb7cae3079 100644 --- a/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -112,37 +112,6 @@ static cl::opt<unsigned> LVLoopDepthThreshold( "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"), cl::init(2), cl::Hidden); -/// Create MDNode for input string. -static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) { - LLVMContext &Context = TheLoop->getHeader()->getContext(); - Metadata *MDs[] = { - MDString::get(Context, Name), - ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))}; - return MDNode::get(Context, MDs); -} - -/// Set input string into loop metadata by keeping other values intact. 
-void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
-                                   unsigned V) {
-  SmallVector<Metadata *, 4> MDs(1);
-  // If the loop already has metadata, retain it.
-  MDNode *LoopID = TheLoop->getLoopID();
-  if (LoopID) {
-    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
-      MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
-      MDs.push_back(Node);
-    }
-  }
-  // Add new metadata.
-  MDs.push_back(createStringMetadata(TheLoop, MDString, V));
-  // Replace current metadata node with new one.
-  LLVMContext &Context = TheLoop->getHeader()->getContext();
-  MDNode *NewLoopID = MDNode::get(Context, MDs);
-  // Set operand 0 to refer to the loop id itself.
-  NewLoopID->replaceOperandWith(0, NewLoopID);
-  TheLoop->setLoopID(NewLoopID);
-}
-
 namespace {
 struct LoopVersioningLICM : public LoopPass {
diff --git a/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
new file mode 100644
index 000000000000..d0fcf38b5a7b
--- /dev/null
+++ b/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -0,0 +1,170 @@
+//===- LowerConstantIntrinsics.cpp - Lower constant intrinsic calls -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers all remaining 'objectsize' and 'is.constant' intrinsic calls
+// and provides constant propagation and basic CFG cleanup on the result.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "lower-is-constant-intrinsic"
+
+STATISTIC(IsConstantIntrinsicsHandled,
+          "Number of 'is.constant' intrinsic calls handled");
+STATISTIC(ObjectSizeIntrinsicsHandled,
+          "Number of 'objectsize' intrinsic calls handled");
+
+static Value *lowerIsConstantIntrinsic(IntrinsicInst *II) {
+  Value *Op = II->getOperand(0);
+
+  return isa<Constant>(Op) ?
ConstantInt::getTrue(II->getType())
+                            : ConstantInt::getFalse(II->getType());
+}
+
+static bool replaceConditionalBranchesOnConstant(Instruction *II,
+                                                 Value *NewValue) {
+  bool HasDeadBlocks = false;
+  SmallSetVector<Instruction *, 8> Worklist;
+  replaceAndRecursivelySimplify(II, NewValue, nullptr, nullptr, nullptr,
+                                &Worklist);
+  for (auto I : Worklist) {
+    BranchInst *BI = dyn_cast<BranchInst>(I);
+    if (!BI)
+      continue;
+    if (BI->isUnconditional())
+      continue;
+
+    BasicBlock *Target, *Other;
+    if (match(BI->getOperand(0), m_Zero())) {
+      Target = BI->getSuccessor(1);
+      Other = BI->getSuccessor(0);
+    } else if (match(BI->getOperand(0), m_One())) {
+      Target = BI->getSuccessor(0);
+      Other = BI->getSuccessor(1);
+    } else {
+      Target = nullptr;
+      Other = nullptr;
+    }
+    if (Target && Target != Other) {
+      BasicBlock *Source = BI->getParent();
+      Other->removePredecessor(Source);
+      BI->eraseFromParent();
+      BranchInst::Create(Target, Source);
+      if (pred_begin(Other) == pred_end(Other))
+        HasDeadBlocks = true;
+    }
+  }
+  return HasDeadBlocks;
+}
+
+static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI) {
+  bool HasDeadBlocks = false;
+  const auto &DL = F.getParent()->getDataLayout();
+  SmallVector<WeakTrackingVH, 8> Worklist;
+
+  ReversePostOrderTraversal<Function *> RPOT(&F);
+  for (BasicBlock *BB : RPOT) {
+    for (Instruction &I : *BB) {
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+      if (!II)
+        continue;
+      switch (II->getIntrinsicID()) {
+      default:
+        break;
+      case Intrinsic::is_constant:
+      case Intrinsic::objectsize:
+        Worklist.push_back(WeakTrackingVH(&I));
+        break;
+      }
+    }
+  }
+  for (WeakTrackingVH &VH : Worklist) {
+    // Items on the worklist can be mutated by earlier recursive replaces.
+    // This can remove the intrinsic as dead (VH == null), but also replace
+    // the intrinsic in place.
+    if (!VH)
+      continue;
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*VH);
+    if (!II)
+      continue;
+    Value *NewValue;
+    switch (II->getIntrinsicID()) {
+    default:
+      continue;
+    case Intrinsic::is_constant:
+      NewValue = lowerIsConstantIntrinsic(II);
+      IsConstantIntrinsicsHandled++;
+      break;
+    case Intrinsic::objectsize:
+      NewValue = lowerObjectSizeCall(II, DL, TLI, true);
+      ObjectSizeIntrinsicsHandled++;
+      break;
+    }
+    HasDeadBlocks |= replaceConditionalBranchesOnConstant(II, NewValue);
+  }
+  if (HasDeadBlocks)
+    removeUnreachableBlocks(F);
+  return !Worklist.empty();
+}
+
+PreservedAnalyses
+LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) {
+  if (lowerConstantIntrinsics(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
+    return PreservedAnalyses::none();
+
+  return PreservedAnalyses::all();
+}
+
+namespace {
+/// Legacy pass for lowering is.constant intrinsics out of the IR.
+///
+/// When this pass is run over a function it converts is.constant intrinsics
+/// into 'true' or 'false'. This complements the normal constant folding
+/// to 'true' as part of Instruction Simplify passes.
+class LowerConstantIntrinsics : public FunctionPass {
+public:
+  static char ID;
+  LowerConstantIntrinsics() : FunctionPass(ID) {
+    initializeLowerConstantIntrinsicsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+    const TargetLibraryInfo *TLI = TLIP ?
&TLIP->getTLI(F) : nullptr; + return lowerConstantIntrinsics(F, TLI); + } +}; +} // namespace + +char LowerConstantIntrinsics::ID = 0; +INITIALIZE_PASS(LowerConstantIntrinsics, "lower-constant-intrinsics", + "Lower constant intrinsics", false, false) + +FunctionPass *llvm::createLowerConstantIntrinsicsPass() { + return new LowerConstantIntrinsics(); +} diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 0d67c0d740ec..d85f20b3f80c 100644 --- a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/MisExpect.h" using namespace llvm; @@ -71,15 +72,20 @@ static bool handleSwitchExpect(SwitchInst &SI) { unsigned n = SI.getNumCases(); // +1 for default case. SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight); - if (Case == *SI.case_default()) - Weights[0] = LikelyBranchWeight; - else - Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight; + uint64_t Index = (Case == *SI.case_default()) ? 0 : Case.getCaseIndex() + 1; + Weights[Index] = LikelyBranchWeight; + + SI.setMetadata( + LLVMContext::MD_misexpect, + MDBuilder(CI->getContext()) + .createMisExpect(Index, LikelyBranchWeight, UnlikelyBranchWeight)); + + SI.setCondition(ArgValue); + misexpect::checkFrontendInstrumentation(SI); SI.setMetadata(LLVMContext::MD_prof, MDBuilder(CI->getContext()).createBranchWeights(Weights)); - SI.setCondition(ArgValue); return true; } @@ -155,7 +161,7 @@ static void handlePhiDef(CallInst *Expect) { return Result; }; - auto *PhiDef = dyn_cast<PHINode>(V); + auto *PhiDef = cast<PHINode>(V); // Get the first dominating conditional branch of the operand // i's incoming block. @@ -280,19 +286,28 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { MDBuilder MDB(CI->getContext()); MDNode *Node; + MDNode *ExpNode; if ((ExpectedValue->getZExtValue() == ValueComparedTo) == - (Predicate == CmpInst::ICMP_EQ)) + (Predicate == CmpInst::ICMP_EQ)) { Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight); - else + ExpNode = MDB.createMisExpect(0, LikelyBranchWeight, UnlikelyBranchWeight); + } else { Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); + ExpNode = MDB.createMisExpect(1, LikelyBranchWeight, UnlikelyBranchWeight); + } - BSI.setMetadata(LLVMContext::MD_prof, Node); + BSI.setMetadata(LLVMContext::MD_misexpect, ExpNode); if (CmpI) CmpI->setOperand(0, ArgValue); else BSI.setCondition(ArgValue); + + misexpect::checkFrontendInstrumentation(BSI); + + BSI.setMetadata(LLVMContext::MD_prof, Node); + return true; } diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 5a055139be4f..2364748efb05 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -69,90 +69,6 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); -static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, - bool &VariableIdxFound, - const DataLayout &DL) { - // Skip over the first indices. - gep_type_iterator GTI = gep_type_begin(GEP); - for (unsigned i = 1; i != Idx; ++i, ++GTI) - /*skip along*/; - - // Compute the offset implied by the rest of the indices. 
- int64_t Offset = 0; - for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { - ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i)); - if (!OpC) - return VariableIdxFound = true; - if (OpC->isZero()) continue; // No offset. - - // Handle struct indices, which add their field offset to the pointer. - if (StructType *STy = GTI.getStructTypeOrNull()) { - Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); - continue; - } - - // Otherwise, we have a sequential type like an array or vector. Multiply - // the index by the ElementSize. - uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); - Offset += Size*OpC->getSExtValue(); - } - - return Offset; -} - -/// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and -/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2 -/// might be &A[40]. In this case offset would be -8. -static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, - const DataLayout &DL) { - Ptr1 = Ptr1->stripPointerCasts(); - Ptr2 = Ptr2->stripPointerCasts(); - - // Handle the trivial case first. - if (Ptr1 == Ptr2) { - Offset = 0; - return true; - } - - GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1); - GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2); - - bool VariableIdxFound = false; - - // If one pointer is a GEP and the other isn't, then see if the GEP is a - // constant offset from the base, as in "P" and "gep P, 1". - if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { - Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL); - return !VariableIdxFound; - } - - if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { - Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL); - return !VariableIdxFound; - } - - // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical - // base. After that base, they may have some number of common (and - // potentially variable) indices. After that they handle some constant - // offset, which determines their offset from each other. At this point, we - // handle no other case. - if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) - return false; - - // Skip any common indices and track the GEP types. - unsigned Idx = 1; - for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx) - if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) - break; - - int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL); - int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL); - if (VariableIdxFound) return false; - - Offset = Offset2-Offset1; - return true; -} - namespace { /// Represents a range of memset'd bytes with the ByteVal value. @@ -419,12 +335,12 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, break; // Check to see if this store is to a constant offset from the start ptr. - int64_t Offset; - if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, - DL)) + Optional<int64_t> Offset = + isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL); + if (!Offset) break; - Ranges.addStore(Offset, NextStore); + Ranges.addStore(*Offset, NextStore); } else { MemSetInst *MSI = cast<MemSetInst>(BI); @@ -433,11 +349,11 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, break; // Check to see if this store is to a constant offset from the start ptr. 
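// Illustrative aside, not part of this commit: the open-coded helper
// removed above answered queries such as "&A[42] versus &A[40]", which for
// i32 elements is a byte offset of 8. The call sites below now use the
// shared isPointerOffset() utility, which returns Optional<int64_t> rather
// than a bool plus an out-parameter.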
-    int64_t Offset;
-    if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL))
+    Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), DL);
+    if (!Offset)
       break;
-    Ranges.addMemSet(Offset, MSI);
+    Ranges.addMemSet(*Offset, MSI);
     }
   }
@@ -597,9 +513,13 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
     ToLift.push_back(C);
     for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k)
-      if (auto *A = dyn_cast<Instruction>(C->getOperand(k)))
-        if (A->getParent() == SI->getParent())
+      if (auto *A = dyn_cast<Instruction>(C->getOperand(k))) {
+        if (A->getParent() == SI->getParent()) {
+          // Cannot hoist user of P above P
+          if (A == P) return false;
           Args.insert(A);
+        }
+      }
   }
   // We made it, we need to lift
@@ -979,7 +899,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
   // If the destination wasn't sufficiently aligned then increase its alignment.
   if (!isDestSufficientlyAligned) {
     assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
-    cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
+    cast<AllocaInst>(cpyDest)->setAlignment(MaybeAlign(srcAlign));
   }
   // Drop any cached information about the call, because we may have changed
@@ -1516,7 +1436,7 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
     return false;
   auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
-  auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
   auto LookupAliasAnalysis = [this]() -> AliasAnalysis & {
     return getAnalysis<AAResultsWrapperPass>().getAAResults();
diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp
index 3d047a193267..98a45b391319 100644
--- a/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/lib/Transforms/Scalar/MergeICmps.cpp
@@ -897,7 +897,7 @@ public:
   bool runOnFunction(Function &F) override {
     if (skipFunction(F))
       return false;
-    const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
     const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
     // MergeICmps does not need the DominatorTree, but we update it if it's
     // already available.
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 30645f4400e3..9799ea7960ec 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -14,9 +14,11 @@
 // diamond (hammock) and merges them into a single load in the header. Similar
 // it sinks and merges two stores to the tail block (footer). The algorithm
 // iterates over the instructions of one side of the diamond and attempts to
-// find a matching load/store on the other side. It hoists / sinks when it
-// thinks it safe to do so. This optimization helps with eg. hiding load
-// latencies, triggering if-conversion, and reducing static code size.
+// find a matching load/store on the other side. A new tail/footer block may be
+// inserted if the tail/footer block has more predecessors (not only the two
+// predecessors that are forming the diamond). It hoists / sinks when it thinks
+// it safe to do so. This optimization helps with e.g. hiding load latencies,
+// triggering if-conversion, and reducing static code size.
 //
 // NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist.
// @@ -103,7 +105,9 @@ class MergedLoadStoreMotion { // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl. const int MagicCompileTimeControl = 250; + const bool SplitFooterBB; public: + MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {} bool run(Function &F, AliasAnalysis &AA); private: @@ -114,7 +118,9 @@ private: PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); bool isStoreSinkBarrierInRange(const Instruction &Start, const Instruction &End, MemoryLocation Loc); - bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst); + bool canSinkStoresAndGEPs(StoreInst *S0, StoreInst *S1) const; + void sinkStoresAndGEPs(BasicBlock *BB, StoreInst *SinkCand, + StoreInst *ElseInst); bool mergeStores(BasicBlock *BB); }; } // end anonymous namespace @@ -217,74 +223,82 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, } /// +/// Check if 2 stores can be sunk together with corresponding GEPs +/// +bool MergedLoadStoreMotion::canSinkStoresAndGEPs(StoreInst *S0, + StoreInst *S1) const { + auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand()); + auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand()); + return A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() && + (A0->getParent() == S0->getParent()) && A1->hasOneUse() && + (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0); +} + +/// /// Merge two stores to same address and sink into \p BB /// /// Also sinks GEP instruction computing the store address /// -bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, - StoreInst *S1) { +void MergedLoadStoreMotion::sinkStoresAndGEPs(BasicBlock *BB, StoreInst *S0, + StoreInst *S1) { // Only one definition? auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand()); auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand()); - if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() && - (A0->getParent() == S0->getParent()) && A1->hasOneUse() && - (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) { - LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump(); - dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n"; - dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n"); - // Hoist the instruction. - BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); - // Intersect optional metadata. - S0->andIRFlags(S1); - S0->dropUnknownNonDebugMetadata(); - - // Create the new store to be inserted at the join point. - StoreInst *SNew = cast<StoreInst>(S0->clone()); - Instruction *ANew = A0->clone(); - SNew->insertBefore(&*InsertPt); - ANew->insertBefore(SNew); - - assert(S0->getParent() == A0->getParent()); - assert(S1->getParent() == A1->getParent()); - - // New PHI operand? Use it. - if (PHINode *NewPN = getPHIOperand(BB, S0, S1)) - SNew->setOperand(0, NewPN); - S0->eraseFromParent(); - S1->eraseFromParent(); - A0->replaceAllUsesWith(ANew); - A0->eraseFromParent(); - A1->replaceAllUsesWith(ANew); - A1->eraseFromParent(); - return true; - } - return false; + LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump(); + dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n"; + dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n"); + // Hoist the instruction. + BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); + // Intersect optional metadata. + S0->andIRFlags(S1); + S0->dropUnknownNonDebugMetadata(); + + // Create the new store to be inserted at the join point. 
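// Illustrative sketch, not part of this commit: for a diamond such as
//   if (c) p->f = a; else p->f = b;
// sinking replaces the two stores and their identical GEPs with a single
// GEP and one store of phi(a, b) in the join block. The names are
// hypothetical.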
+ StoreInst *SNew = cast<StoreInst>(S0->clone()); + Instruction *ANew = A0->clone(); + SNew->insertBefore(&*InsertPt); + ANew->insertBefore(SNew); + + assert(S0->getParent() == A0->getParent()); + assert(S1->getParent() == A1->getParent()); + + // New PHI operand? Use it. + if (PHINode *NewPN = getPHIOperand(BB, S0, S1)) + SNew->setOperand(0, NewPN); + S0->eraseFromParent(); + S1->eraseFromParent(); + A0->replaceAllUsesWith(ANew); + A0->eraseFromParent(); + A1->replaceAllUsesWith(ANew); + A1->eraseFromParent(); } /// /// True when two stores are equivalent and can sink into the footer /// -/// Starting from a diamond tail block, iterate over the instructions in one -/// predecessor block and try to match a store in the second predecessor. +/// Starting from a diamond head block, iterate over the instructions in one +/// successor block and try to match a store in the second successor. /// -bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { +bool MergedLoadStoreMotion::mergeStores(BasicBlock *HeadBB) { bool MergedStores = false; - assert(T && "Footer of a diamond cannot be empty"); - - pred_iterator PI = pred_begin(T), E = pred_end(T); - assert(PI != E); - BasicBlock *Pred0 = *PI; - ++PI; - BasicBlock *Pred1 = *PI; - ++PI; + BasicBlock *TailBB = getDiamondTail(HeadBB); + BasicBlock *SinkBB = TailBB; + assert(SinkBB && "Footer of a diamond cannot be empty"); + + succ_iterator SI = succ_begin(HeadBB); + assert(SI != succ_end(HeadBB) && "Diamond head cannot have zero successors"); + BasicBlock *Pred0 = *SI; + ++SI; + assert(SI != succ_end(HeadBB) && "Diamond head cannot have single successor"); + BasicBlock *Pred1 = *SI; // tail block of a diamond/hammock? if (Pred0 == Pred1) return false; // No. - if (PI != E) - return false; // No. More than 2 predecessors. - - // #Instructions in Succ1 for Compile Time Control + // bail out early if we can not merge into the footer BB + if (!SplitFooterBB && TailBB->hasNPredecessorsOrMore(3)) + return false; + // #Instructions in Pred1 for Compile Time Control auto InstsNoDbg = Pred1->instructionsWithoutDebug(); int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end()); int NStores = 0; @@ -304,14 +318,23 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { if (NStores * Size1 >= MagicCompileTimeControl) break; if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) { - bool Res = sinkStore(T, S0, S1); - MergedStores |= Res; - // Don't attempt to sink below stores that had to stick around - // But after removal of a store and some of its feeding - // instruction search again from the beginning since the iterator - // is likely stale at this point. - if (!Res) + if (!canSinkStoresAndGEPs(S0, S1)) + // Don't attempt to sink below stores that had to stick around + // But after removal of a store and some of its feeding + // instruction search again from the beginning since the iterator + // is likely stale at this point. break; + + if (SinkBB == TailBB && TailBB->hasNPredecessorsOrMore(3)) { + // We have more than 2 predecessors. Insert a new block + // postdominating 2 predecessors we're going to sink from. 
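// Illustrative aside, not part of this commit: SplitBlockPredecessors here
// carves out a new ".sink.split" block reached only by the two diamond
// arms, so a merged store never executes on an unrelated third predecessor
// of the original footer. This CFG edit is why the pass stops preserving
// CFG analyses when SplitFooterBB is enabled.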
+ SinkBB = SplitBlockPredecessors(TailBB, {Pred0, Pred1}, ".sink.split"); + if (!SinkBB) + break; + } + + MergedStores = true; + sinkStoresAndGEPs(SinkBB, S0, S1); RBI = Pred0->rbegin(); RBE = Pred0->rend(); LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump()); @@ -328,13 +351,15 @@ bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. + // This loop doesn't care about newly inserted/split blocks + // since they never will be diamond heads. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { BasicBlock *BB = &*FI++; // Hoist equivalent loads and sink stores // outside diamonds when possible if (isDiamondHead(BB)) { - Changed |= mergeStores(getDiamondTail(BB)); + Changed |= mergeStores(BB); } } return Changed; @@ -342,9 +367,11 @@ bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) { namespace { class MergedLoadStoreMotionLegacyPass : public FunctionPass { + const bool SplitFooterBB; public: static char ID; // Pass identification, replacement for typeid - MergedLoadStoreMotionLegacyPass() : FunctionPass(ID) { + MergedLoadStoreMotionLegacyPass(bool SplitFooterBB = false) + : FunctionPass(ID), SplitFooterBB(SplitFooterBB) { initializeMergedLoadStoreMotionLegacyPassPass( *PassRegistry::getPassRegistry()); } @@ -355,13 +382,14 @@ public: bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; - MergedLoadStoreMotion Impl; + MergedLoadStoreMotion Impl(SplitFooterBB); return Impl.run(F, getAnalysis<AAResultsWrapperPass>().getAAResults()); } private: void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); + if (!SplitFooterBB) + AU.setPreservesCFG(); AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } @@ -373,8 +401,8 @@ char MergedLoadStoreMotionLegacyPass::ID = 0; /// /// createMergedLoadStoreMotionPass - The public interface to this file. 
/// -FunctionPass *llvm::createMergedLoadStoreMotionPass() { - return new MergedLoadStoreMotionLegacyPass(); +FunctionPass *llvm::createMergedLoadStoreMotionPass(bool SplitFooterBB) { + return new MergedLoadStoreMotionLegacyPass(SplitFooterBB); } INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion", @@ -385,13 +413,14 @@ INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion", PreservedAnalyses MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) { - MergedLoadStoreMotion Impl; + MergedLoadStoreMotion Impl(Options.SplitFooterBB); auto &AA = AM.getResult<AAManager>(F); if (!Impl.run(F, AA)) return PreservedAnalyses::all(); PreservedAnalyses PA; - PA.preserveSet<CFGAnalyses>(); + if (!Options.SplitFooterBB) + PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); return PA; } diff --git a/lib/Transforms/Scalar/NaryReassociate.cpp b/lib/Transforms/Scalar/NaryReassociate.cpp index 94436b55752a..1260bd39cdee 100644 --- a/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/lib/Transforms/Scalar/NaryReassociate.cpp @@ -170,7 +170,7 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) { auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); return Impl.runImpl(F, AC, DT, SE, TLI, TTI); diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 08ac2b666fce..b213264de557 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -89,6 +89,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -122,6 +123,7 @@ using namespace llvm; using namespace llvm::GVNExpression; using namespace llvm::VNCoercion; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "newgvn" @@ -656,7 +658,7 @@ public: TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA, const DataLayout &DL) : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL), - PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)), + PredInfo(std::make_unique<PredicateInfo>(F, *DT, *AC)), SQ(DL, TLI, DT, AC, /*CtxI=*/nullptr, /*UseInstrInfo=*/false) {} bool runGVN(); @@ -1332,7 +1334,7 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp, E->setOpcode(0); E->op_push_back(PointerOp); if (LI) - E->setAlignment(LI->getAlignment()); + E->setAlignment(MaybeAlign(LI->getAlignment())); // TODO: Value number heap versions. We may be able to discover // things alias analysis can't on it's own (IE that a store and a @@ -1637,8 +1639,11 @@ const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const { if (AA->doesNotAccessMemory(CI)) { return createCallExpression(CI, TOPClass->getMemoryLeader()); } else if (AA->onlyReadsMemory(CI)) { - MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI); - return createCallExpression(CI, DefiningAccess); + if (auto *MA = MSSA->getMemoryAccess(CI)) { + auto *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(MA); + return createCallExpression(CI, DefiningAccess); + } else // MSSA determined that CI does not access memory. 
+ return createCallExpression(CI, TOPClass->getMemoryLeader()); } return nullptr; } @@ -1754,7 +1759,7 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps, return true; }); // If we are left with no operands, it's dead. - if (empty(Filtered)) { + if (Filtered.empty()) { // If it has undef at this point, it means there are no-non-undef arguments, // and thus, the value of the phi node must be undef. if (HasUndef) { @@ -2464,9 +2469,9 @@ Value *NewGVN::findConditionEquivalence(Value *Cond) const { // Process the outgoing edges of a block for reachability. void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) { // Evaluate reachability of terminator instruction. - BranchInst *BR; - if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) { - Value *Cond = BR->getCondition(); + Value *Cond; + BasicBlock *TrueSucc, *FalseSucc; + if (match(TI, m_Br(m_Value(Cond), TrueSucc, FalseSucc))) { Value *CondEvaluated = findConditionEquivalence(Cond); if (!CondEvaluated) { if (auto *I = dyn_cast<Instruction>(Cond)) { @@ -2479,8 +2484,6 @@ void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) { } } ConstantInt *CI; - BasicBlock *TrueSucc = BR->getSuccessor(0); - BasicBlock *FalseSucc = BR->getSuccessor(1); if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) { if (CI->isOne()) { LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI @@ -4196,7 +4199,7 @@ bool NewGVNLegacyPass::runOnFunction(Function &F) { return false; return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F), &getAnalysis<AAResultsWrapperPass>().getAAResults(), &getAnalysis<MemorySSAWrapperPass>().getMSSA(), F.getParent()->getDataLayout()) diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 039123218544..68a0f5151ad5 100644 --- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -161,7 +161,7 @@ public: return false; TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); return runPartiallyInlineLibCalls(F, TLI, TTI); diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp index b544f0a39ea8..beb299272ed8 100644 --- a/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -131,7 +131,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); for (Loop *I : *LI) { runOnLoopAndSubLoops(I); } @@ -240,7 +240,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, BasicBlock *Pred) { // A conservative bound on the loop as a whole. 
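The hunk just below renames getMaxBackedgeTakenCount to getConstantMaxBackedgeTakenCount without changing its semantics; the guarded-use pattern mirrors the code below. A sketch, assuming a caller-supplied bit width:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"

using namespace llvm;

// True when SCEV proves a constant bound on L's backedge-taken count that
// fits in Width bits; getCouldNotCompute() signals that no bound is known.
static bool hasSmallConstantMaxTrips(Loop *L, ScalarEvolution *SE,
                                     unsigned Width) {
  const SCEV *MaxTrips = SE->getConstantMaxBackedgeTakenCount(L);
  return MaxTrips != SE->getCouldNotCompute() &&
         SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(Width);
}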
- const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L); + const SCEV *MaxTrips = SE->getConstantMaxBackedgeTakenCount(L); if (MaxTrips != SE->getCouldNotCompute() && SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN( CountedLoopTripWidth)) @@ -478,7 +478,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { return false; const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); bool Modified = false; diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index fa8c9e2a5fe4..124f625ef7b6 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -861,7 +861,7 @@ static Value *NegateValue(Value *V, Instruction *BI, // this use. We do this by moving it to the entry block (if it is a // non-instruction value) or right after the definition. These negates will // be zapped by reassociate later, so we don't need much finesse here. - BinaryOperator *TheNeg = cast<BinaryOperator>(U); + Instruction *TheNeg = cast<Instruction>(U); // Verify that the negate is in this function, V might be a constant expr. if (TheNeg->getParent()->getParent() != BI->getParent()->getParent()) @@ -1938,88 +1938,132 @@ void ReassociatePass::EraseInst(Instruction *I) { MadeChange = true; } -// Canonicalize expressions of the following form: -// x + (-Constant * y) -> x - (Constant * y) -// x - (-Constant * y) -> x + (Constant * y) -Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) { - if (!I->hasOneUse() || I->getType()->isVectorTy()) - return nullptr; - - // Must be a fmul or fdiv instruction. - unsigned Opcode = I->getOpcode(); - if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv) - return nullptr; - - auto *C0 = dyn_cast<ConstantFP>(I->getOperand(0)); - auto *C1 = dyn_cast<ConstantFP>(I->getOperand(1)); - - // Both operands are constant, let it get constant folded away. - if (C0 && C1) - return nullptr; - - ConstantFP *CF = C0 ? C0 : C1; - - // Must have one constant operand. - if (!CF) - return nullptr; +/// Recursively analyze an expression to build a list of instructions that have +/// negative floating-point constant operands. The caller can then transform +/// the list to create positive constants for better reassociation and CSE. +static void getNegatibleInsts(Value *V, + SmallVectorImpl<Instruction *> &Candidates) { + // Handle only one-use instructions. Combining negations does not justify + // replicating instructions. + Instruction *I; + if (!match(V, m_OneUse(m_Instruction(I)))) + return; - // Must be a negative ConstantFP. - if (!CF->isNegative()) - return nullptr; + // Handle expressions of multiplications and divisions. + // TODO: This could look through floating-point casts. + const APFloat *C; + switch (I->getOpcode()) { + case Instruction::FMul: + // Not expecting non-canonical code here. Bail out and wait. + if (match(I->getOperand(0), m_Constant())) + break; - // User must be a binary operator with one or more uses. - Instruction *User = I->user_back(); - if (!isa<BinaryOperator>(User) || User->use_empty()) - return nullptr; + if (match(I->getOperand(1), m_APFloat(C)) && C->isNegative()) { + Candidates.push_back(I); + LLVM_DEBUG(dbgs() << "FMul with negative constant: " << *I << '\n'); + } + getNegatibleInsts(I->getOperand(0), Candidates); + getNegatibleInsts(I->getOperand(1), Candidates); + break; + case Instruction::FDiv: + // Not expecting non-canonical code here. Bail out and wait. 
+ if (match(I->getOperand(0), m_Constant()) && + match(I->getOperand(1), m_Constant())) + break; - unsigned UserOpcode = User->getOpcode(); - if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub) - return nullptr; + if ((match(I->getOperand(0), m_APFloat(C)) && C->isNegative()) || + (match(I->getOperand(1), m_APFloat(C)) && C->isNegative())) { + Candidates.push_back(I); + LLVM_DEBUG(dbgs() << "FDiv with negative constant: " << *I << '\n'); + } + getNegatibleInsts(I->getOperand(0), Candidates); + getNegatibleInsts(I->getOperand(1), Candidates); + break; + default: + break; + } +} - // Subtraction is not commutative. Explicitly, the following transform is - // not valid: (-Constant * y) - x -> x + (Constant * y) - if (!User->isCommutative() && User->getOperand(1) != I) +/// Given an fadd/fsub with an operand that is a one-use instruction +/// (the fadd/fsub), try to change negative floating-point constants into +/// positive constants to increase potential for reassociation and CSE. +Instruction *ReassociatePass::canonicalizeNegFPConstantsForOp(Instruction *I, + Instruction *Op, + Value *OtherOp) { + assert((I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) && "Expected fadd/fsub"); + + // Collect instructions with negative FP constants from the subtree that ends + // in Op. + SmallVector<Instruction *, 4> Candidates; + getNegatibleInsts(Op, Candidates); + if (Candidates.empty()) return nullptr; // Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the // resulting subtract will be broken up later. This can get us into an // infinite loop during reassociation. - if (UserOpcode == Instruction::FAdd && ShouldBreakUpSubtract(User)) + bool IsFSub = I->getOpcode() == Instruction::FSub; + bool NeedsSubtract = !IsFSub && Candidates.size() % 2 == 1; + if (NeedsSubtract && ShouldBreakUpSubtract(I)) return nullptr; - // Change the sign of the constant. - APFloat Val = CF->getValueAPF(); - Val.changeSign(); - I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val)); - - // Canonicalize I to RHS to simplify the next bit of logic. E.g., - // ((-Const*y) + x) -> (x + (-Const*y)). 
- if (User->getOperand(0) == I && User->isCommutative()) - cast<BinaryOperator>(User)->swapOperands(); - - Value *Op0 = User->getOperand(0); - Value *Op1 = User->getOperand(1); - BinaryOperator *NI; - switch (UserOpcode) { - default: - llvm_unreachable("Unexpected Opcode!"); - case Instruction::FAdd: - NI = BinaryOperator::CreateFSub(Op0, Op1); - NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags()); - break; - case Instruction::FSub: - NI = BinaryOperator::CreateFAdd(Op0, Op1); - NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags()); - break; + for (Instruction *Negatible : Candidates) { + const APFloat *C; + if (match(Negatible->getOperand(0), m_APFloat(C))) { + assert(!match(Negatible->getOperand(1), m_Constant()) && + "Expecting only 1 constant operand"); + assert(C->isNegative() && "Expected negative FP constant"); + Negatible->setOperand(0, ConstantFP::get(Negatible->getType(), abs(*C))); + MadeChange = true; + } + if (match(Negatible->getOperand(1), m_APFloat(C))) { + assert(!match(Negatible->getOperand(0), m_Constant()) && + "Expecting only 1 constant operand"); + assert(C->isNegative() && "Expected negative FP constant"); + Negatible->setOperand(1, ConstantFP::get(Negatible->getType(), abs(*C))); + MadeChange = true; + } } + assert(MadeChange == true && "Negative constant candidate was not changed"); - NI->insertBefore(User); - NI->setName(User->getName()); - User->replaceAllUsesWith(NI); - NI->setDebugLoc(I->getDebugLoc()); + // Negations cancelled out. + if (Candidates.size() % 2 == 0) + return I; + + // Negate the final operand in the expression by flipping the opcode of this + // fadd/fsub. + assert(Candidates.size() % 2 == 1 && "Expected odd number"); + IRBuilder<> Builder(I); + Value *NewInst = IsFSub ? Builder.CreateFAddFMF(OtherOp, Op, I) + : Builder.CreateFSubFMF(OtherOp, Op, I); + I->replaceAllUsesWith(NewInst); RedoInsts.insert(I); - MadeChange = true; - return NI; + return dyn_cast<Instruction>(NewInst); +} + +/// Canonicalize expressions that contain a negative floating-point constant +/// of the following form: +/// OtherOp + (subtree) -> OtherOp {+/-} (canonical subtree) +/// (subtree) + OtherOp -> OtherOp {+/-} (canonical subtree) +/// OtherOp - (subtree) -> OtherOp {+/-} (canonical subtree) +/// +/// The fadd/fsub opcode may be switched to allow folding a negation into the +/// input instruction. +Instruction *ReassociatePass::canonicalizeNegFPConstants(Instruction *I) { + LLVM_DEBUG(dbgs() << "Combine negations for: " << *I << '\n'); + Value *X; + Instruction *Op; + if (match(I, m_FAdd(m_Value(X), m_OneUse(m_Instruction(Op))))) + if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X)) + I = R; + if (match(I, m_FAdd(m_OneUse(m_Instruction(Op)), m_Value(X)))) + if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X)) + I = R; + if (match(I, m_FSub(m_Value(X), m_OneUse(m_Instruction(Op))))) + if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X)) + I = R; + return I; } /// Inspect and optimize the given instruction. Note that erasing @@ -2042,16 +2086,16 @@ void ReassociatePass::OptimizeInst(Instruction *I) { I = NI; } - // Canonicalize negative constants out of expressions. - if (Instruction *Res = canonicalizeNegConstExpr(I)) - I = Res; - // Commute binary operators, to canonicalize the order of their operands. // This can potentially expose more CSE opportunities, and makes writing other // transformations simpler. 
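To see the effect of the rewritten canonicalization above: each candidate's negative constant is replaced by its absolute value, and if an odd number of signs were flipped, the surrounding fadd/fsub toggles its opcode to compensate; an even number of flips cancels out and the opcode is kept. Two illustrative cases (made-up values, not taken from the patch):

  x + (-2.0 * y)                 ->  x - (2.0 * y)                  ; one flip, so fadd becomes fsub
  x - ((-2.0 * y) / (-4.0 * z))  ->  x - ((2.0 * y) / (4.0 * z))    ; two flips cancel, opcode unchanged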
if (I->isCommutative()) canonicalizeOperands(I); + // Canonicalize negative constants out of expressions. + if (Instruction *Res = canonicalizeNegFPConstants(I)) + I = Res; + // Don't optimize floating-point instructions unless they are 'fast'. if (I->getType()->isFPOrFPVectorTy() && !I->isFast()) return; diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index c358258d24cf..48bbdd8d1b33 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -172,8 +172,6 @@ public: bool runOnModule(Module &M) override { bool Changed = false; - const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); for (Function &F : M) { // Nothing to do for declarations. if (F.isDeclaration() || F.empty()) @@ -186,6 +184,8 @@ public: TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); Changed |= Impl.runOnFunction(F, DT, TTI, TLI); @@ -2530,7 +2530,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, // statepoints surviving this pass. This makes testing easier and the // resulting IR less confusing to human readers. DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - bool MadeChange = removeUnreachableBlocks(F, nullptr, &DTU); + bool MadeChange = removeUnreachableBlocks(F, &DTU); // Flush the Dominator Tree. DTU.getDomTree(); diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 4093e50ce899..10fbdc8aacd2 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -191,7 +191,7 @@ public: /// class SCCPSolver : public InstVisitor<SCCPSolver> { const DataLayout &DL; - const TargetLibraryInfo *TLI; + std::function<const TargetLibraryInfo &(Function &)> GetTLI; SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable. DenseMap<Value *, LatticeVal> ValueState; // The state each value is in. // The state each parameter is in. @@ -268,8 +268,9 @@ public: return {A->second.DT, A->second.PDT, DomTreeUpdater::UpdateStrategy::Lazy}; } - SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli) - : DL(DL), TLI(tli) {} + SCCPSolver(const DataLayout &DL, + std::function<const TargetLibraryInfo &(Function &)> GetTLI) + : DL(DL), GetTLI(std::move(GetTLI)) {} /// MarkBlockExecutable - This method can be used by clients to mark all of /// the blocks that are known to be intrinsically live in the processed unit. @@ -1290,7 +1291,7 @@ CallOverdefined: // If we can constant fold this, mark the result of the call as a // constant. if (Constant *C = ConstantFoldCall(cast<CallBase>(CS.getInstruction()), F, - Operands, TLI)) { + Operands, &GetTLI(*F))) { // call -> undef. if (isa<UndefValue>(C)) return; @@ -1465,7 +1466,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } LatticeVal &LV = getValueState(&I); - if (!LV.isUnknown()) continue; + if (!LV.isUnknown()) + continue; + + // There are two reasons a call can have an undef result + // 1. It could be tracked. + // 2. It could be constant-foldable. + // Because of the way we solve return values, tracked calls must + // never be marked overdefined in ResolvedUndefsIn. 
+ if (CallSite CS = CallSite(&I)) {
+ if (Function *F = CS.getCalledFunction())
+ if (TrackedRetVals.count(F))
+ continue;
+
+ // If the call is constant-foldable, we mark it overdefined because
+ // we do not know what return values are valid.
+ markOverdefined(&I);
+ return true;
+ }

 // extractvalue is safe; check here because the argument is a struct.
 if (isa<ExtractValueInst>(I))
@@ -1638,19 +1656,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
 case Instruction::Call:
 case Instruction::Invoke:
 case Instruction::CallBr:
- // There are two reasons a call can have an undef result
- // 1. It could be tracked.
- // 2. It could be constant-foldable.
- // Because of the way we solve return values, tracked calls must
- // never be marked overdefined in ResolvedUndefsIn.
- if (Function *F = CallSite(&I).getCalledFunction())
- if (TrackedRetVals.count(F))
- break;
-
- // If the call is constant-foldable, we mark it overdefined because
- // we do not know what return values are valid.
- markOverdefined(&I);
- return true;
+ llvm_unreachable("Call-like instructions should have been handled early");
 default:
 // If we don't know what should happen here, conservatively mark it
 // overdefined.
@@ -1751,7 +1757,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
 [](const LatticeVal &LV) { return LV.isOverdefined(); }))
 return false;
 std::vector<Constant *> ConstVals;
- auto *ST = dyn_cast<StructType>(V->getType());
+ auto *ST = cast<StructType>(V->getType());
 for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
 LatticeVal V = IVs[i];
 ConstVals.push_back(V.isConstant()
@@ -1796,7 +1802,8 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
 static bool runSCCP(Function &F, const DataLayout &DL,
 const TargetLibraryInfo *TLI) {
 LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
- SCCPSolver Solver(DL, TLI);
+ SCCPSolver Solver(
+ DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; });
 // Mark the first block of the function as being executable.
 Solver.MarkBlockExecutable(&F.front());
@@ -1891,7 +1898,7 @@ public:
 return false;
 const DataLayout &DL = F.getParent()->getDataLayout();
 const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
 return runSCCP(F, DL, TLI);
 }
 };
@@ -1924,6 +1931,27 @@ static void findReturnsToZap(Function &F,
 return;
 }
+ assert(
+ all_of(F.users(),
+ [&Solver](User *U) {
+ if (isa<Instruction>(U) &&
+ !Solver.isBlockExecutable(cast<Instruction>(U)->getParent()))
+ return true;
+ // Non-callsite uses are not impacted by zapping. Also, constant
+ // uses (like blockaddresses) could stick around, without being
+ // used in the underlying IR, meaning we do not have lattice
+ // values for them.
+ if (!CallSite(U)) + return true; + if (U->getType()->isStructTy()) { + return all_of( + Solver.getStructLatticeValueFor(U), + [](const LatticeVal &LV) { return !LV.isOverdefined(); }); + } + return !Solver.getLatticeValueFor(U).isOverdefined(); + }) && + "We can only zap functions where all live users have a concrete value"); + for (BasicBlock &BB : F) { if (CallInst *CI = BB.getTerminatingMustTailCall()) { LLVM_DEBUG(dbgs() << "Can't zap return of the block due to present " @@ -1974,9 +2002,10 @@ static void forceIndeterminateEdge(Instruction* I, SCCPSolver &Solver) { } bool llvm::runIPSCCP( - Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI, + Module &M, const DataLayout &DL, + std::function<const TargetLibraryInfo &(Function &)> GetTLI, function_ref<AnalysisResultsForFn(Function &)> getAnalysis) { - SCCPSolver Solver(DL, TLI); + SCCPSolver Solver(DL, GetTLI); // Loop over all functions, marking arguments to those with their addresses // taken or that are external as overdefined. diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 33f90d0b01e4..74b8ff913050 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -959,14 +959,16 @@ private: std::tie(UsedI, I) = Uses.pop_back_val(); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); + Size = std::max(Size, + DL.getTypeStoreSize(LI->getType()).getFixedSize()); continue; } if (StoreInst *SI = dyn_cast<StoreInst>(I)) { Value *Op = SI->getOperand(0); if (Op == UsedI) return SI; - Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); + Size = std::max(Size, + DL.getTypeStoreSize(Op->getType()).getFixedSize()); continue; } @@ -1197,7 +1199,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) { // TODO: Allow recursive phi users. // TODO: Allow stores. BasicBlock *BB = PN.getParent(); - unsigned MaxAlign = 0; + MaybeAlign MaxAlign; uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType()); APInt MaxSize(APWidth, 0); bool HaveLoad = false; @@ -1218,8 +1220,8 @@ static bool isSafePHIToSpeculate(PHINode &PN) { if (BBI->mayWriteToMemory()) return false; - uint64_t Size = DL.getTypeStoreSizeInBits(LI->getType()); - MaxAlign = std::max(MaxAlign, LI->getAlignment()); + uint64_t Size = DL.getTypeStoreSize(LI->getType()); + MaxAlign = std::max(MaxAlign, MaybeAlign(LI->getAlignment())); MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize; HaveLoad = true; } @@ -1266,11 +1268,11 @@ static void speculatePHINodeLoads(PHINode &PN) { PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), PN.getName() + ".sroa.speculated"); - // Get the AA tags and alignment to use from one of the loads. It doesn't + // Get the AA tags and alignment to use from one of the loads. It does not // matter which one we get and if any differ. AAMDNodes AATags; SomeLoad->getAAMetadata(AATags); - unsigned Align = SomeLoad->getAlignment(); + const MaybeAlign Align = MaybeAlign(SomeLoad->getAlignment()); // Rewrite all loads of the PN to use the new PHI. while (!PN.use_empty()) { @@ -1338,11 +1340,11 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) { // Both operands to the select need to be dereferenceable, either // absolutely (e.g. allocas) or at this point because we can see other // accesses to it. 
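Returning briefly to the SCCP change above: runIPSCCP and the solver now take a callback rather than a single TargetLibraryInfo pointer, so TLI can be fetched per function. A condensed sketch of the shape, with simplified names:

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include <functional>
#include <utility>

using namespace llvm;

// Solver-style class holding a per-function TLI accessor instead of one
// pointer; call sites ask GetTLI(F) for the function currently visited.
class SolverSketch {
  std::function<const TargetLibraryInfo &(Function &)> GetTLI;

public:
  explicit SolverSketch(
      std::function<const TargetLibraryInfo &(Function &)> GetTLI)
      : GetTLI(std::move(GetTLI)) {}

  const TargetLibraryInfo &tliFor(Function &F) const { return GetTLI(F); }
};

// Adapting a single legacy pointer, as runSCCP does above:
//   SolverSketch S([TLI](Function &) -> const TargetLibraryInfo & {
//     return *TLI;
//   });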
- if (!isSafeToLoadUnconditionally(TValue, LI->getType(), LI->getAlignment(),
- DL, LI))
+ if (!isSafeToLoadUnconditionally(TValue, LI->getType(),
+ MaybeAlign(LI->getAlignment()), DL, LI))
 return false;
- if (!isSafeToLoadUnconditionally(FValue, LI->getType(), LI->getAlignment(),
- DL, LI))
+ if (!isSafeToLoadUnconditionally(FValue, LI->getType(),
+ MaybeAlign(LI->getAlignment()), DL, LI))
 return false;
 }
@@ -1368,8 +1370,8 @@ static void speculateSelectInstLoads(SelectInst &SI) {
 NumLoadsSpeculated += 2;
 // Transfer alignment and AA info if present.
- TL->setAlignment(LI->getAlignment());
- FL->setAlignment(LI->getAlignment());
+ TL->setAlignment(MaybeAlign(LI->getAlignment()));
+ FL->setAlignment(MaybeAlign(LI->getAlignment()));
 AAMDNodes Tags;
 LI->getAAMetadata(Tags);
@@ -1888,6 +1890,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
 bool HaveCommonEltTy = true;
 auto CheckCandidateType = [&](Type *Ty) {
 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ // Bail out if this vector's total bit width differs from the other candidates'.
+ if (!CandidateTys.empty()) {
+ VectorType *V = CandidateTys[0];
+ if (DL.getTypeSizeInBits(VTy) != DL.getTypeSizeInBits(V)) {
+ CandidateTys.clear();
+ return;
+ }
+ }
 CandidateTys.push_back(VTy);
 if (!CommonEltTy)
 CommonEltTy = VTy->getElementType();
@@ -3110,7 +3120,7 @@ private:
 unsigned LoadAlign = LI->getAlignment();
 if (!LoadAlign)
 LoadAlign = DL.getABITypeAlignment(LI->getType());
- LI->setAlignment(std::min(LoadAlign, getSliceAlign()));
+ LI->setAlignment(MaybeAlign(std::min(LoadAlign, getSliceAlign())));
 continue;
 }
 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
@@ -3119,7 +3129,7 @@ private:
 Value *Op = SI->getOperand(0);
 StoreAlign = DL.getABITypeAlignment(Op->getType());
 }
- SI->setAlignment(std::min(StoreAlign, getSliceAlign()));
+ SI->setAlignment(MaybeAlign(std::min(StoreAlign, getSliceAlign())));
 continue;
 }
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 869cf00e0a89..1d2e40bf62be 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -79,6 +79,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
 initializeLoopVersioningLICMPass(Registry);
 initializeLoopIdiomRecognizeLegacyPassPass(Registry);
 initializeLowerAtomicLegacyPassPass(Registry);
+ initializeLowerConstantIntrinsicsPass(Registry);
 initializeLowerExpectIntrinsicPass(Registry);
 initializeLowerGuardIntrinsicLegacyPassPass(Registry);
 initializeLowerWidenableConditionLegacyPassPass(Registry);
@@ -123,6 +124,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
 unwrap(PM)->add(createAggressiveDCEPass());
 }
+void LLVMAddDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadCodeEliminationPass());
+}
+
 void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
 unwrap(PM)->add(createBitTrackingDCEPass());
 }
@@ -280,6 +285,10 @@ void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
 unwrap(PM)->add(createBasicAAWrapperPass());
 }
+void LLVMAddLowerConstantIntrinsicsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerConstantIntrinsicsPass());
+}
+
 void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) {
 unwrap(PM)->add(createLowerExpectIntrinsicPass());
 }
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index f6a12fb13142..41554fccdf08 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1121,7 +1121,7 @@ bool
SeparateConstOffsetFromGEP::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); bool Changed = false; for (BasicBlock &B : F) { for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;) diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index aeac6f548b32..ac832b9b4567 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -1909,7 +1909,7 @@ static void unswitchNontrivialInvariants( // We can only unswitch switches, conditional branches with an invariant // condition, or combining invariant conditions with an instruction. - assert((SI || BI->isConditional()) && + assert((SI || (BI && BI->isConditional())) && "Can only unswitch switches and conditional branch!"); bool FullUnswitch = SI || BI->getCondition() == Invariants[0]; if (FullUnswitch) @@ -2141,17 +2141,21 @@ static void unswitchNontrivialInvariants( buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction, *ClonedPH, *LoopPH); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); + + if (MSSAU) { + DT.applyUpdates(DTUpdates); + DTUpdates.clear(); + + // Perform MSSA cloning updates. + for (auto &VMap : VMaps) + MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap, + /*IgnoreIncomingWithNoClones=*/true); + MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); + } } // Apply the updates accumulated above to get an up-to-date dominator tree. DT.applyUpdates(DTUpdates); - if (!FullUnswitch && MSSAU) { - // Update MSSA for partial unswitch, after DT update. - SmallVector<CFGUpdate, 1> Updates; - Updates.push_back( - {cfg::UpdateKind::Insert, SplitBB, ClonedPHs.begin()->second}); - MSSAU->applyInsertUpdates(Updates, DT); - } // Now that we have an accurate dominator tree, first delete the dead cloned // blocks so that we can accurately build any cloned loops. It is important to @@ -2720,7 +2724,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, return Cost * (SuccessorsCount - 1); }; Instruction *BestUnswitchTI = nullptr; - int BestUnswitchCost; + int BestUnswitchCost = 0; ArrayRef<Value *> BestUnswitchInvariants; for (auto &TerminatorAndInvariants : UnswitchCandidates) { Instruction &TI = *TerminatorAndInvariants.first; @@ -2752,6 +2756,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, BestUnswitchInvariants = Invariants; } } + assert(BestUnswitchTI && "Failed to find loop unswitch candidate"); if (BestUnswitchCost >= UnswitchThreshold) { LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " @@ -2880,7 +2885,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; } diff --git a/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp index c13fb3e04516..e6db11f47ead 100644 --- a/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp +++ b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp @@ -777,8 +777,10 @@ static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs, // speculation if the predecessor is an invoke. 
This doesn't seem // fundamental and we should probably be splitting critical edges // differently. - if (isa<IndirectBrInst>(PredBB->getTerminator()) || - isa<InvokeInst>(PredBB->getTerminator())) { + const auto *TermInst = PredBB->getTerminator(); + if (isa<IndirectBrInst>(TermInst) || + isa<InvokeInst>(TermInst) || + isa<CallBrInst>(TermInst)) { LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: " << PredBB->getName() << "\n"); return false; diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp index e5400676c7e8..9791cf41f621 100644 --- a/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -65,7 +65,7 @@ static cl::opt<bool> ForceSkipUniformRegions( static cl::opt<bool> RelaxedUniformRegions("structurizecfg-relaxed-uniform-regions", cl::Hidden, cl::desc("Allow relaxed uniform region checks"), - cl::init(false)); + cl::init(true)); // Definition of the complex types used in this pass. diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index f0b79079d817..b27a36b67d62 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -341,7 +341,7 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { const DataLayout &DL = L->getModule()->getDataLayout(); if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) || !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(), - L->getAlignment(), DL, L)) + MaybeAlign(L->getAlignment()), DL, L)) return false; } } |
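Two themes run through this whole batch and are worth spelling out once. First, raw unsigned alignments are being funneled through MaybeAlign, whose empty state represents the old "alignment 0 means unknown" convention. Second, TargetLibraryInfoWrapperPass::getTLI now takes the Function, apparently because library-call availability can differ per function (for instance with no-builtin attributes). A short sketch of the post-change call shape in the final hunk above, assuming a load L:

#include "llvm/Analysis/Loads.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// MaybeAlign(0) is empty ("unknown"), so the old sentinel round-trips safely
// through the new isSafeToLoadUnconditionally signature.
static bool loadIsSpeculatable(LoadInst *L, const DataLayout &DL) {
  return isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(),
                                     MaybeAlign(L->getAlignment()), DL, L);
}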