author     Dimitry Andric <dim@FreeBSD.org>    2015-01-18 16:17:27 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2015-01-18 16:17:27 +0000
commit     67c32a98315f785a9ec9d531c1f571a0196c7463
tree       4abb9cbeecc7901726dd0b4a37369596c852e9ef /lib/Transforms/Scalar
parent     9f61947910e6ab40de38e6b4034751ef1513200f
Diffstat (limited to 'lib/Transforms/Scalar')
36 files changed, 3531 insertions, 1794 deletions
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index 1a3a4aadce6a..3d9198469bc5 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -73,7 +73,7 @@ bool ADCE::runOnFunction(Function& F) { for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end(); OI != OE; ++OI) if (Instruction* Inst = dyn_cast<Instruction>(OI)) - if (alive.insert(Inst)) + if (alive.insert(Inst).second) worklist.push_back(Inst); } diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp new file mode 100644 index 000000000000..f48cefaa4fba --- /dev/null +++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -0,0 +1,428 @@ +//===----------------------- AlignmentFromAssumptions.cpp -----------------===// +// Set Load/Store Alignments From Assumptions +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a ScalarEvolution-based transformation to set +// the alignments of load, stores and memory intrinsics based on the truth +// expressions of assume intrinsics. The primary motivation is to handle +// complex alignment assumptions that apply to vector loads and stores that +// appear after vectorization and unrolling. +// +//===----------------------------------------------------------------------===// + +#define AA_NAME "alignment-from-assumptions" +#define DEBUG_TYPE AA_NAME +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +STATISTIC(NumLoadAlignChanged, + "Number of loads changed by alignment assumptions"); +STATISTIC(NumStoreAlignChanged, + "Number of stores changed by alignment assumptions"); +STATISTIC(NumMemIntAlignChanged, + "Number of memory intrinsics changed by alignment assumptions"); + +namespace { +struct AlignmentFromAssumptions : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + AlignmentFromAssumptions() : FunctionPass(ID) { + initializeAlignmentFromAssumptionsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<ScalarEvolution>(); + AU.addRequired<DominatorTreeWrapperPass>(); + + AU.setPreservesCFG(); + AU.addPreserved<LoopInfo>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<ScalarEvolution>(); + } + + // For memory transfers, we need a common alignment for both the source and + // destination. If we have a new alignment for only one operand of a transfer + // instruction, save it in these maps. If we reach the other operand through + // another assumption later, then we may change the alignment at that point. 
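The struct comment above explains why the new pass keeps two side tables for memory transfers: a memcpy needs one common alignment for source and destination, but a single assumption usually improves only one side, so the improvement is parked until the other side is seen. A minimal standalone sketch of that bookkeeping, with hypothetical types and a deliberately simplified combination rule (take the smaller of the two sides); the real four-way selection appears later in processAssumption():

    #include <algorithm>
    #include <unordered_map>

    struct Transfer { unsigned CurAlign; };   // stand-in for a MemTransferInst

    std::unordered_map<Transfer *, unsigned> NewDestAligns, NewSrcAligns;

    // Record a new alignment for the destination of a transfer. If an earlier
    // assumption already improved the source, raise the transfer's common
    // alignment to the smaller of the two sides; otherwise just remember it.
    void noteDestAlign(Transfer *T, unsigned DestAlign) {
      NewDestAligns[T] = DestAlign;
      auto It = NewSrcAligns.find(T);
      if (It != NewSrcAligns.end())
        T->CurAlign = std::max(T->CurAlign, std::min(DestAlign, It->second));
    }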
+ DenseMap<MemTransferInst *, unsigned> NewDestAlignments, NewSrcAlignments; + + ScalarEvolution *SE; + DominatorTree *DT; + const DataLayout *DL; + + bool extractAlignmentInfo(CallInst *I, Value *&AAPtr, const SCEV *&AlignSCEV, + const SCEV *&OffSCEV); + bool processAssumption(CallInst *I); +}; +} + +char AlignmentFromAssumptions::ID = 0; +static const char aip_name[] = "Alignment from assumptions"; +INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME, + aip_name, false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME, + aip_name, false, false) + +FunctionPass *llvm::createAlignmentFromAssumptionsPass() { + return new AlignmentFromAssumptions(); +} + +// Given an expression for the (constant) alignment, AlignSCEV, and an +// expression for the displacement between a pointer and the aligned address, +// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced +// to a constant. Using SCEV to compute alignment handles the case where +// DiffSCEV is a recurrence with constant start such that the aligned offset +// is constant. e.g. {16,+,32} % 32 -> 16. +static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV, + const SCEV *AlignSCEV, + ScalarEvolution *SE) { + // DiffUnits = Diff % int64_t(Alignment) + const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV); + const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV); + const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV); + + DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " << + *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n"); + + if (const SCEVConstant *ConstDUSCEV = + dyn_cast<SCEVConstant>(DiffUnitsSCEV)) { + int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue(); + + // If the displacement is an exact multiple of the alignment, then the + // displaced pointer has the same alignment as the aligned pointer, so + // return the alignment value. + if (!DiffUnits) + return (unsigned) + cast<SCEVConstant>(AlignSCEV)->getValue()->getSExtValue(); + + // If the displacement is not an exact multiple, but the remainder is a + // constant, then return this remainder (but only if it is a power of 2). + uint64_t DiffUnitsAbs = abs64(DiffUnits); + if (isPowerOf2_64(DiffUnitsAbs)) + return (unsigned) DiffUnitsAbs; + } + + return 0; +} + +// There is an address given by an offset OffSCEV from AASCEV which has an +// alignment AlignSCEV. Use that information, if possible, to compute a new +// alignment for Ptr. +static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV, + const SCEV *OffSCEV, Value *Ptr, + ScalarEvolution *SE) { + const SCEV *PtrSCEV = SE->getSCEV(Ptr); + const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV); + + // On 32-bit platforms, DiffSCEV might now have type i32 -- we've always + // sign-extended OffSCEV to i64, so make sure they agree again. + DiffSCEV = SE->getNoopOrSignExtend(DiffSCEV, OffSCEV->getType()); + + // What we really want to know is the overall offset to the aligned + // address. This address is displaced by the provided offset. 
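getNewAlignment() above reduces to integer arithmetic on the displacement between the queried pointer and the assumed-aligned address: subtract the assumption's offset, then reduce modulo the assumed alignment. A standalone restatement for a constant displacement (the pass works on SCEV expressions, so it also handles recurrences such as {16,+,32} % 32 -> 16):

    #include <cstdint>
    #include <cstdlib>

    // Reduce a constant displacement modulo the assumed alignment, mirroring
    // what getNewAlignmentDiff() does with SCEV expressions.
    static unsigned alignFromDiff(int64_t Diff, uint64_t Align) {
      int64_t Rem = Diff % static_cast<int64_t>(Align);
      if (Rem == 0)
        return static_cast<unsigned>(Align);          // exact multiple
      uint64_t RemAbs = static_cast<uint64_t>(std::llabs(Rem));
      // A non-zero remainder is only usable if it is a power of two.
      return (RemAbs & (RemAbs - 1)) == 0 ? static_cast<unsigned>(RemAbs) : 0;
    }

    // "Base + Off is Align-byte aligned" lets us claim an alignment for Ptr
    // derived from the overall displacement (Ptr - Base) - Off.
    static unsigned alignForPointer(int64_t PtrMinusBase, int64_t Off,
                                    uint64_t Align) {
      return alignFromDiff(PtrMinusBase - Off, Align);
    }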
+ DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV); + + DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to " << + *AlignSCEV << " and offset " << *OffSCEV << + " using diff " << *DiffSCEV << "\n"); + + unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE); + DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n"); + + if (NewAlignment) { + return NewAlignment; + } else if (const SCEVAddRecExpr *DiffARSCEV = + dyn_cast<SCEVAddRecExpr>(DiffSCEV)) { + // The relative offset to the alignment assumption did not yield a constant, + // but we should try harder: if we assume that a is 32-byte aligned, then in + // for (i = 0; i < 1024; i += 4) r += a[i]; not all of the loads from a are + // 32-byte aligned, but instead alternate between 32 and 16-byte alignment. + // As a result, the new alignment will not be a constant, but can still + // be improved over the default (of 4) to 16. + + const SCEV *DiffStartSCEV = DiffARSCEV->getStart(); + const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE); + + DEBUG(dbgs() << "\ttrying start/inc alignment using start " << + *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n"); + + // Now compute the new alignment using the displacement to the value in the + // first iteration, and also the alignment using the per-iteration delta. + // If these are the same, then use that answer. Otherwise, use the smaller + // one, but only if it divides the larger one. + NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE); + unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE); + + DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n"); + DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n"); + + if (!NewAlignment || !NewIncAlignment) { + return 0; + } else if (NewAlignment > NewIncAlignment) { + if (NewAlignment % NewIncAlignment == 0) { + DEBUG(dbgs() << "\tnew start/inc alignment: " << + NewIncAlignment << "\n"); + return NewIncAlignment; + } + } else if (NewIncAlignment > NewAlignment) { + if (NewIncAlignment % NewAlignment == 0) { + DEBUG(dbgs() << "\tnew start/inc alignment: " << + NewAlignment << "\n"); + return NewAlignment; + } + } else if (NewIncAlignment == NewAlignment) { + DEBUG(dbgs() << "\tnew start/inc alignment: " << + NewAlignment << "\n"); + return NewAlignment; + } + } + + return 0; +} + +bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I, + Value *&AAPtr, const SCEV *&AlignSCEV, + const SCEV *&OffSCEV) { + // An alignment assume must be a statement about the least-significant + // bits of the pointer being zero, possibly with some offset. + ICmpInst *ICI = dyn_cast<ICmpInst>(I->getArgOperand(0)); + if (!ICI) + return false; + + // This must be an expression of the form: x & m == 0. + if (ICI->getPredicate() != ICmpInst::ICMP_EQ) + return false; + + // Swap things around so that the RHS is 0. + Value *CmpLHS = ICI->getOperand(0); + Value *CmpRHS = ICI->getOperand(1); + const SCEV *CmpLHSSCEV = SE->getSCEV(CmpLHS); + const SCEV *CmpRHSSCEV = SE->getSCEV(CmpRHS); + if (CmpLHSSCEV->isZero()) + std::swap(CmpLHS, CmpRHS); + else if (!CmpRHSSCEV->isZero()) + return false; + + BinaryOperator *CmpBO = dyn_cast<BinaryOperator>(CmpLHS); + if (!CmpBO || CmpBO->getOpcode() != Instruction::And) + return false; + + // Swap things around so that the right operand of the and is a constant + // (the mask); we cannot deal with variable masks. 
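The matcher above reduces the assume condition to the shape (x & mask) == 0 with a constant mask; the code that follows turns the mask's trailing one bits into an alignment. A standalone illustration of that conversion (the real pass additionally caps the result at Value::MaximumAlignment):

    #include <cstdint>

    // "(x & Mask) == 0" proves the low bits of x are zero. The number of
    // trailing one bits in the mask gives the provable alignment:
    // Mask == 7 (0b111) means the low three bits are zero, i.e. 8 bytes.
    static uint64_t alignmentFromMask(uint64_t Mask) {
      unsigned TrailingOnes = 0;
      while (Mask & 1) {
        ++TrailingOnes;
        Mask >>= 1;
      }
      if (TrailingOnes == 0)
        return 0;                 // mask has no low ones: proves nothing
      if (TrailingOnes > 63)
        TrailingOnes = 63;        // keep the shift well-defined (cf. the cap)
      return uint64_t(1) << TrailingOnes;
    }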
+ Value *AndLHS = CmpBO->getOperand(0); + Value *AndRHS = CmpBO->getOperand(1); + const SCEV *AndLHSSCEV = SE->getSCEV(AndLHS); + const SCEV *AndRHSSCEV = SE->getSCEV(AndRHS); + if (isa<SCEVConstant>(AndLHSSCEV)) { + std::swap(AndLHS, AndRHS); + std::swap(AndLHSSCEV, AndRHSSCEV); + } + + const SCEVConstant *MaskSCEV = dyn_cast<SCEVConstant>(AndRHSSCEV); + if (!MaskSCEV) + return false; + + // The mask must have some trailing ones (otherwise the condition is + // trivial and tells us nothing about the alignment of the left operand). + unsigned TrailingOnes = + MaskSCEV->getValue()->getValue().countTrailingOnes(); + if (!TrailingOnes) + return false; + + // Cap the alignment at the maximum with which LLVM can deal (and make sure + // we don't overflow the shift). + uint64_t Alignment; + TrailingOnes = std::min(TrailingOnes, + unsigned(sizeof(unsigned) * CHAR_BIT - 1)); + Alignment = std::min(1u << TrailingOnes, +Value::MaximumAlignment); + + Type *Int64Ty = Type::getInt64Ty(I->getParent()->getParent()->getContext()); + AlignSCEV = SE->getConstant(Int64Ty, Alignment); + + // The LHS might be a ptrtoint instruction, or it might be the pointer + // with an offset. + AAPtr = nullptr; + OffSCEV = nullptr; + if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) { + AAPtr = PToI->getPointerOperand(); + OffSCEV = SE->getConstant(Int64Ty, 0); + } else if (const SCEVAddExpr* AndLHSAddSCEV = + dyn_cast<SCEVAddExpr>(AndLHSSCEV)) { + // Try to find the ptrtoint; subtract it and the rest is the offset. + for (SCEVAddExpr::op_iterator J = AndLHSAddSCEV->op_begin(), + JE = AndLHSAddSCEV->op_end(); J != JE; ++J) + if (const SCEVUnknown *OpUnk = dyn_cast<SCEVUnknown>(*J)) + if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(OpUnk->getValue())) { + AAPtr = PToI->getPointerOperand(); + OffSCEV = SE->getMinusSCEV(AndLHSAddSCEV, *J); + break; + } + } + + if (!AAPtr) + return false; + + // Sign extend the offset to 64 bits (so that it is like all of the other + // expressions). + unsigned OffSCEVBits = OffSCEV->getType()->getPrimitiveSizeInBits(); + if (OffSCEVBits < 64) + OffSCEV = SE->getSignExtendExpr(OffSCEV, Int64Ty); + else if (OffSCEVBits > 64) + return false; + + AAPtr = AAPtr->stripPointerCasts(); + return true; +} + +bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { + Value *AAPtr; + const SCEV *AlignSCEV, *OffSCEV; + if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV)) + return false; + + const SCEV *AASCEV = SE->getSCEV(AAPtr); + + // Apply the assumption to all other users of the specified pointer. 
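For context, input of roughly this shape is what Clang produces for __builtin_assume_aligned: the builtin is lowered to an llvm.assume over a ptrtoint/and/icmp-eq-zero condition like the one extracted above (a hedged example; the exact IR depends on the compiler version). The pass can then re-annotate the loads with the assumed alignment:

    #include <cstddef>

    // After vectorization and unrolling, alignment-from-assumptions can mark
    // the loads from q as 32-byte aligned based on the assume emitted here.
    float sumAligned(const float *p, size_t n) {
      const float *q =
          static_cast<const float *>(__builtin_assume_aligned(p, 32));
      float s = 0.0f;
      for (size_t i = 0; i < n; ++i)
        s += q[i];
      return s;
    }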
+ SmallPtrSet<Instruction *, 32> Visited; + SmallVector<Instruction*, 16> WorkList; + for (User *J : AAPtr->users()) { + if (J == ACall) + continue; + + if (Instruction *K = dyn_cast<Instruction>(J)) + if (isValidAssumeForContext(ACall, K, DL, DT)) + WorkList.push_back(K); + } + + while (!WorkList.empty()) { + Instruction *J = WorkList.pop_back_val(); + + if (LoadInst *LI = dyn_cast<LoadInst>(J)) { + unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, + LI->getPointerOperand(), SE); + + if (NewAlignment > LI->getAlignment()) { + LI->setAlignment(NewAlignment); + ++NumLoadAlignChanged; + } + } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) { + unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, + SI->getPointerOperand(), SE); + + if (NewAlignment > SI->getAlignment()) { + SI->setAlignment(NewAlignment); + ++NumStoreAlignChanged; + } + } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) { + unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, + MI->getDest(), SE); + + // For memory transfers, we need a common alignment for both the + // source and destination. If we have a new alignment for this + // instruction, but only for one operand, save it. If we reach the + // other operand through another assumption later, then we may + // change the alignment at that point. + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { + unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, + MTI->getSource(), SE); + + DenseMap<MemTransferInst *, unsigned>::iterator DI = + NewDestAlignments.find(MTI); + unsigned AltDestAlignment = (DI == NewDestAlignments.end()) ? + 0 : DI->second; + + DenseMap<MemTransferInst *, unsigned>::iterator SI = + NewSrcAlignments.find(MTI); + unsigned AltSrcAlignment = (SI == NewSrcAlignments.end()) ? + 0 : SI->second; + + DEBUG(dbgs() << "\tmem trans: " << NewDestAlignment << " " << + AltDestAlignment << " " << NewSrcAlignment << + " " << AltSrcAlignment << "\n"); + + // Of these four alignments, pick the largest possible... + unsigned NewAlignment = 0; + if (NewDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment)) + NewAlignment = std::max(NewAlignment, NewDestAlignment); + if (AltDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment)) + NewAlignment = std::max(NewAlignment, AltDestAlignment); + if (NewSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment)) + NewAlignment = std::max(NewAlignment, NewSrcAlignment); + if (AltSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment)) + NewAlignment = std::max(NewAlignment, AltSrcAlignment); + + if (NewAlignment > MI->getAlignment()) { + MI->setAlignment(ConstantInt::get(Type::getInt32Ty( + MI->getParent()->getContext()), NewAlignment)); + ++NumMemIntAlignChanged; + } + + NewDestAlignments.insert(std::make_pair(MTI, NewDestAlignment)); + NewSrcAlignments.insert(std::make_pair(MTI, NewSrcAlignment)); + } else if (NewDestAlignment > MI->getAlignment()) { + assert((!isa<MemIntrinsic>(MI) || isa<MemSetInst>(MI)) && + "Unknown memory intrinsic"); + + MI->setAlignment(ConstantInt::get(Type::getInt32Ty( + MI->getParent()->getContext()), NewDestAlignment)); + ++NumMemIntAlignChanged; + } + } + + // Now that we've updated that use of the pointer, look for other uses of + // the pointer to update. 
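The memcpy handling above chooses, among the new and previously recorded alignments of each side, the largest value that the other side can also support. The same selection, restated as plain unsigned arithmetic (0 meaning unknown):

    #include <algorithm>

    // NewDest/AltDest: freshly computed and previously recorded destination
    // alignments; NewSrc/AltSrc: the same for the source operand.
    static unsigned commonTransferAlign(unsigned NewDest, unsigned AltDest,
                                        unsigned NewSrc, unsigned AltSrc) {
      unsigned SrcMax = std::max(NewSrc, AltSrc);
      unsigned DstMax = std::max(NewDest, AltDest);
      unsigned Best = 0;
      if (NewDest <= SrcMax) Best = std::max(Best, NewDest);
      if (AltDest <= SrcMax) Best = std::max(Best, AltDest);
      if (NewSrc <= DstMax)  Best = std::max(Best, NewSrc);
      if (AltSrc <= DstMax)  Best = std::max(Best, AltSrc);
      return Best;
    }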
+ Visited.insert(J); + for (User *UJ : J->users()) { + Instruction *K = cast<Instruction>(UJ); + if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DL, DT)) + WorkList.push_back(K); + } + } + + return true; +} + +bool AlignmentFromAssumptions::runOnFunction(Function &F) { + bool Changed = false; + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + SE = &getAnalysis<ScalarEvolution>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + + NewDestAlignments.clear(); + NewSrcAlignments.clear(); + + for (auto &AssumeVH : AC.assumptions()) + if (AssumeVH) + Changed |= processAssumption(cast<CallInst>(AssumeVH)); + + return Changed; +} + diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 261ddda30150..b3ee11ed67cd 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp + AlignmentFromAssumptions.cpp ConstantHoisting.cpp ConstantProp.cpp CorrelatedValuePropagation.cpp diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index 763d02b9fcd6..27c177a542e3 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -91,7 +91,7 @@ struct RebasedConstantInfo { Constant *Offset; RebasedConstantInfo(ConstantUseListType &&Uses, Constant *Offset) - : Uses(Uses), Offset(Offset) { } + : Uses(std::move(Uses)), Offset(Offset) { } }; /// \brief A base constant and all its rebased constants. @@ -395,7 +395,7 @@ void ConstantHoisting::findAndMakeBaseConstant(ConstCandVecType::iterator S, ConstInfo.RebasedConstants.push_back( RebasedConstantInfo(std::move(ConstCand->Uses), Offset)); } - ConstantVec.push_back(ConstInfo); + ConstantVec.push_back(std::move(ConstInfo)); } /// \brief Finds and combines constant candidates that can be easily diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 082946229b35..5a3b5cf34cc3 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -73,7 +73,7 @@ bool CorrelatedValuePropagation::processSelect(SelectInst *S) { if (S->getType()->isVectorTy()) return false; if (isa<Constant>(S->getOperand(0))) return false; - Constant *C = LVI->getConstant(S->getOperand(0), S->getParent()); + Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S); if (!C) return false; ConstantInt *CI = dyn_cast<ConstantInt>(C); @@ -100,7 +100,7 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { Value *Incoming = P->getIncomingValue(i); if (isa<Constant>(Incoming)) continue; - Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB); + Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P); // Look if the incoming value is a select with a constant but LVI tells us // that the incoming value can never be that constant. 
In that case replace @@ -114,7 +114,7 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { if (!C) continue; if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, - P->getIncomingBlock(i), BB) != + P->getIncomingBlock(i), BB, P) != LazyValueInfo::False) continue; @@ -126,6 +126,7 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { Changed = true; } + // FIXME: Provide DL, TLI, DT, AT to SimplifyInstruction. if (Value *V = SimplifyInstruction(P)) { P->replaceAllUsesWith(V); P->eraseFromParent(); @@ -147,7 +148,7 @@ bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { if (isa<Constant>(Pointer)) return false; - Constant *C = LVI->getConstant(Pointer, I->getParent()); + Constant *C = LVI->getConstant(Pointer, I->getParent(), I); if (!C) return false; ++NumMemAccess; @@ -173,13 +174,15 @@ bool CorrelatedValuePropagation::processCmp(CmpInst *C) { if (PI == PE) return false; LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(), - C->getOperand(0), Op1, *PI, C->getParent()); + C->getOperand(0), Op1, *PI, + C->getParent(), C); if (Result == LazyValueInfo::Unknown) return false; ++PI; while (PI != PE) { LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(), - C->getOperand(0), Op1, *PI, C->getParent()); + C->getOperand(0), Op1, *PI, + C->getParent(), C); if (Res != Result) return false; ++PI; } @@ -229,7 +232,8 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { for (pred_iterator PI = PB; PI != PE; ++PI) { // Is the switch condition equal to the case value? LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ, - Cond, Case, *PI, BB); + Cond, Case, *PI, + BB, SI); // Give up on this case if nothing is known. if (Value == LazyValueInfo::Unknown) { State = LazyValueInfo::Unknown; diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 3af8ee7546fb..a1ddc00da5ba 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -356,15 +356,8 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // If we don't know the sizes of either access, then we can't do a // comparison. if (Later.Size == AliasAnalysis::UnknownSize || - Earlier.Size == AliasAnalysis::UnknownSize) { - // If we have no DataLayout information around, then the size of the store - // is inferrable from the pointee type. If they are the same type, then - // we know that the store is safe. - if (DL == nullptr && Later.Ptr->getType() == Earlier.Ptr->getType()) - return OverwriteComplete; - + Earlier.Size == AliasAnalysis::UnknownSize) return OverwriteUnknown; - } // Make sure that the Later size is >= the Earlier size. 
if (Later.Size >= Earlier.Size) diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 735f5c194cb5..394b0d3de7bd 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -16,17 +16,21 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" -#include <vector> +#include <deque> using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "early-cse" @@ -266,6 +270,7 @@ public: const DataLayout *DL; const TargetLibraryInfo *TLI; DominatorTree *DT; + AssumptionCache *AC; typedef RecyclingAllocator<BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy; typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>, @@ -378,6 +383,7 @@ private: // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfo>(); AU.setPreservesCFG(); @@ -393,6 +399,7 @@ FunctionPass *llvm::createEarlyCSEPass() { } INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false) @@ -431,9 +438,18 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { continue; } + // Skip assume intrinsics, they don't really have side effects (although + // they're marked as such to ensure preservation of control dependencies), + // and this pass will not disturb any of the assumption's control + // dependencies. + if (match(Inst, m_Intrinsic<Intrinsic::assume>())) { + DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n'); + continue; + } + // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. - if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT)) { + if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT, AC)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); Inst->replaceAllUsesWith(V); Inst->eraseFromParent(); @@ -530,7 +546,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { Changed = true; ++NumDSE; LastStore = nullptr; - continue; + // fallthrough - we can exploit information about this store } // Okay, we just invalidated anything we knew about loaded values. Try @@ -556,12 +572,17 @@ bool EarlyCSE::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; - std::vector<StackNode *> nodesToProcess; + // Note, deque is being used here because there is significant performance gains + // over vector when the container becomes very large due to the specific access + // patterns. For more information see the mailing list discussion on this: + // http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html + std::deque<StackNode *> nodesToProcess; DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? 
&DLP->getDataLayout() : nullptr; TLI = &getAnalysis<TargetLibraryInfo>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); // Tables that the pass uses when walking the domtree. ScopedHTType AVTable; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 106eba099ca0..b814b2525dca 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -20,10 +20,12 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -45,6 +47,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <vector> using namespace llvm; @@ -590,6 +593,7 @@ namespace { DominatorTree *DT; const DataLayout *DL; const TargetLibraryInfo *TLI; + AssumptionCache *AC; SetVector<BasicBlock *> DeadBlocks; ValueTable VN; @@ -679,6 +683,7 @@ namespace { // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfo>(); if (!NoLoads) @@ -705,6 +710,7 @@ namespace { void dump(DenseMap<uint32_t, Value*> &d); bool iterateOnFunction(Function &F); bool performPRE(Function &F); + bool performScalarPRE(Instruction *I); Value *findLeader(const BasicBlock *BB, uint32_t num); void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; @@ -727,6 +733,7 @@ FunctionPass *llvm::createGVNPass(bool NoLoads) { } INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) @@ -1616,7 +1623,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // If all preds have a single successor, then we know it is safe to insert // the load on the pred (?!?), so we can insert code to materialize the // pointer if it is not available. - PHITransAddr Address(LI->getPointerOperand(), DL); + PHITransAddr Address(LI->getPointerOperand(), DL, AC); Value *LoadPtr = nullptr; LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT, NewInsts); @@ -1669,9 +1676,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, LI->getAlignment(), UnavailablePred->getTerminator()); - // Transfer the old load's TBAA tag to the new load. - if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) - NewLoad->setMetadata(LLVMContext::MD_tbaa, Tag); + // Transfer the old load's AA tags to the new load. + AAMDNodes Tags; + LI->getAAMetadata(Tags); + if (Tags) + NewLoad->setAAMetadata(Tags); // Transfer DebugLoc. NewLoad->setDebugLoc(LI->getDebugLoc()); @@ -1700,8 +1709,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, bool GVN::processNonLocalLoad(LoadInst *LI) { // Step 1: Find the non-local dependencies of the load. 
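The EarlyCSE change above swaps the std::vector work stack for a std::deque, citing measurably better behavior once the container grows very large (a deque grows in fixed-size chunks instead of reallocating and copying the whole buffer). A minimal sketch of the usage pattern, with a hypothetical node type rather than the pass's StackNode:

    #include <deque>
    #include <vector>

    struct Node { std::vector<Node *> Children; };

    // Iterative depth-first walk using std::deque as a LIFO stack.
    static unsigned countNodes(Node *Root) {
      if (!Root)
        return 0;
      unsigned N = 0;
      std::deque<Node *> Work;
      Work.push_back(Root);
      while (!Work.empty()) {
        Node *Cur = Work.back();
        Work.pop_back();
        ++N;
        for (Node *C : Cur->Children)
          Work.push_back(C);
      }
      return N;
    }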
LoadDepVect Deps; - AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); - MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); + MD->getNonLocalPointerDependency(LI, Deps); // If we had to process more than one hundred blocks to find the // dependencies, this load isn't worth worrying about. Optimizing @@ -1722,6 +1730,15 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return false; } + // If this load follows a GEP, see if we can PRE the indices before analyzing. + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) { + for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(), + OE = GEP->idx_end(); + OI != OE; ++OI) + if (Instruction *I = dyn_cast<Instruction>(OI->get())) + performScalarPRE(I); + } + // Step 2: Analyze the availability of the load AvailValInBlkVect ValuesPerBlock; UnavailBlkVect UnavailableBlocks; @@ -1774,36 +1791,24 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { ReplOp->setHasNoUnsignedWrap(false); } if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) { - SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata; - ReplInst->getAllMetadataOtherThanDebugLoc(Metadata); - for (int i = 0, n = Metadata.size(); i < n; ++i) { - unsigned Kind = Metadata[i].first; - MDNode *IMD = I->getMetadata(Kind); - MDNode *ReplMD = Metadata[i].second; - switch(Kind) { - default: - ReplInst->setMetadata(Kind, nullptr); // Remove unknown metadata - break; - case LLVMContext::MD_dbg: - llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg"); - case LLVMContext::MD_tbaa: - ReplInst->setMetadata(Kind, MDNode::getMostGenericTBAA(IMD, ReplMD)); - break; - case LLVMContext::MD_range: - ReplInst->setMetadata(Kind, MDNode::getMostGenericRange(IMD, ReplMD)); - break; - case LLVMContext::MD_prof: - llvm_unreachable("MD_prof in a non-terminator instruction"); - break; - case LLVMContext::MD_fpmath: - ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD)); - break; - case LLVMContext::MD_invariant_load: - // Only set the !invariant.load if it is present in both instructions. - ReplInst->setMetadata(Kind, IMD); - break; - } - } + // FIXME: If both the original and replacement value are part of the + // same control-flow region (meaning that the execution of one + // guarentees the executation of the other), then we can combine the + // noalias scopes here and do better than the general conservative + // answer used in combineMetadata(). + + // In general, GVN unifies expressions over different control-flow + // regions, and so we need a conservative combination of the noalias + // scopes. + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_range, + LLVMContext::MD_fpmath, + LLVMContext::MD_invariant_load, + }; + combineMetadata(ReplInst, I, KnownIDs); } } @@ -2101,15 +2106,15 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, std::swap(LHS, RHS); assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!"); - // If there is no obvious reason to prefer the left-hand side over the right- - // hand side, ensure the longest lived term is on the right-hand side, so the - // shortest lived term will be replaced by the longest lived. This tends to - // expose more simplifications. + // If there is no obvious reason to prefer the left-hand side over the + // right-hand side, ensure the longest lived term is on the right-hand side, + // so the shortest lived term will be replaced by the longest lived. 
+ // This tends to expose more simplifications. uint32_t LVN = VN.lookup_or_add(LHS); if ((isa<Argument>(LHS) && isa<Argument>(RHS)) || (isa<Instruction>(LHS) && isa<Instruction>(RHS))) { - // Move the 'oldest' value to the right-hand side, using the value number as - // a proxy for age. + // Move the 'oldest' value to the right-hand side, using the value number + // as a proxy for age. uint32_t RVN = VN.lookup_or_add(RHS); if (LVN < RVN) { std::swap(LHS, RHS); @@ -2138,10 +2143,10 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, NumGVNEqProp += NumReplacements; } - // Now try to deduce additional equalities from this one. For example, if the - // known equality was "(A != B)" == "false" then it follows that A and B are - // equal in the scope. Only boolean equalities with an explicit true or false - // RHS are currently supported. + // Now try to deduce additional equalities from this one. For example, if + // the known equality was "(A != B)" == "false" then it follows that A and B + // are equal in the scope. Only boolean equalities with an explicit true or + // false RHS are currently supported. if (!RHS->getType()->isIntegerTy(1)) // Not a boolean equality - bail out. continue; @@ -2166,7 +2171,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, // If we are propagating an equality like "(A == B)" == "true" then also // propagate the equality A == B. When propagating a comparison such as // "(A >= B)" == "true", replace all instances of "A < B" with "false". - if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { + if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) { Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); // If "A == B" is known true, or "A != B" is known false, then replace @@ -2175,12 +2180,17 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) Worklist.push_back(std::make_pair(Op0, Op1)); + // Handle the floating point versions of equality comparisons too. + if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) || + (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) + Worklist.push_back(std::make_pair(Op0, Op1)); + // If "A >= B" is known true, replace "A < B" with false everywhere. CmpInst::Predicate NotPred = Cmp->getInversePredicate(); Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); - // Since we don't have the instruction "A < B" immediately to hand, work out - // the value number that it would have and use that to find an appropriate - // instruction (if any). + // Since we don't have the instruction "A < B" immediately to hand, work + // out the value number that it would have and use that to find an + // appropriate instruction (if any). uint32_t NextNum = VN.getNextUnusedValueNumber(); uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1); // If the number we were assigned was brand new then there is no point in @@ -2219,7 +2229,7 @@ bool GVN::processInstruction(Instruction *I) { // to value numbering it. Value numbering often exposes redundancies, for // example if it determines that %y is equal to %x then the instruction // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. 
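propagateEquality(), shown above, records more than the literal equality: when "(A >= B) == true" is learned, the inverse comparison "A < B" is also recorded as false so later occurrences fold away, and the comment just above gives the classic payoff (and i32 %x, %y becoming and i32 %x, %x). A loose standalone sketch of that bookkeeping with a toy leader table; the value-number plumbing of the real pass is omitted:

    #include <cstdint>
    #include <map>
    #include <tuple>

    // Proven boolean value per (predicate, lhs, rhs) triple in the current
    // dominator scope; operands are identified by toy value numbers.
    using CmpKey = std::tuple<int, uint32_t, uint32_t>;

    // When a comparison is known true (or false), also record its inverse
    // predicate with the opposite truth value, as propagateEquality() does.
    static void recordComparison(std::map<CmpKey, bool> &Leaders, int Pred,
                                 int InversePred, uint32_t L, uint32_t R,
                                 bool Known) {
      Leaders[std::make_tuple(Pred, L, R)] = Known;
      Leaders[std::make_tuple(InversePred, L, R)] = !Known;
    }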
- if (Value *V = SimplifyInstruction(I, DL, TLI, DT)) { + if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) { I->replaceAllUsesWith(V); if (MD && V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); @@ -2339,6 +2349,7 @@ bool GVN::runOnFunction(Function& F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? &DLP->getDataLayout() : nullptr; + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); TLI = &getAnalysis<TargetLibraryInfo>(); VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); VN.setMemDep(MD); @@ -2435,175 +2446,182 @@ bool GVN::processBlock(BasicBlock *BB) { return ChangedFunction; } -/// performPRE - Perform a purely local form of PRE that looks for diamond -/// control flow patterns and attempts to perform simple PRE at the join point. -bool GVN::performPRE(Function &F) { - bool Changed = false; +bool GVN::performScalarPRE(Instruction *CurInst) { SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap; - for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) { - // Nothing to PRE in the entry block. - if (CurrentBlock == &F.getEntryBlock()) continue; - // Don't perform PRE on a landing pad. - if (CurrentBlock->isLandingPad()) continue; + if (isa<AllocaInst>(CurInst) || isa<TerminatorInst>(CurInst) || + isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() || + CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || + isa<DbgInfoIntrinsic>(CurInst)) + return false; - for (BasicBlock::iterator BI = CurrentBlock->begin(), - BE = CurrentBlock->end(); BI != BE; ) { - Instruction *CurInst = BI++; + // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from + // sinking the compare again, and it would force the code generator to + // move the i1 from processor flags or predicate registers into a general + // purpose register. + if (isa<CmpInst>(CurInst)) + return false; - if (isa<AllocaInst>(CurInst) || - isa<TerminatorInst>(CurInst) || isa<PHINode>(CurInst) || - CurInst->getType()->isVoidTy() || - CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || - isa<DbgInfoIntrinsic>(CurInst)) - continue; + // We don't currently value number ANY inline asm calls. + if (CallInst *CallI = dyn_cast<CallInst>(CurInst)) + if (CallI->isInlineAsm()) + return false; - // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from - // sinking the compare again, and it would force the code generator to - // move the i1 from processor flags or predicate registers into a general - // purpose register. - if (isa<CmpInst>(CurInst)) - continue; + uint32_t ValNo = VN.lookup(CurInst); + + // Look for the predecessors for PRE opportunities. We're + // only trying to solve the basic diamond case, where + // a value is computed in the successor and one predecessor, + // but not the other. We also explicitly disallow cases + // where the successor is its own predecessor, because they're + // more complicated to get right. + unsigned NumWith = 0; + unsigned NumWithout = 0; + BasicBlock *PREPred = nullptr; + BasicBlock *CurrentBlock = CurInst->getParent(); + predMap.clear(); + + for (pred_iterator PI = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock); + PI != PE; ++PI) { + BasicBlock *P = *PI; + // We're not interested in PRE where the block is its + // own predecessor, or in blocks with predecessors + // that are not reachable. 
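The diamond case described above is easiest to see at the source level: the expression is already computed along one incoming edge but not the other, so PRE materializes it in the predecessor that lacks it and joins the two copies with a PHI. An illustrative before/after in C terms (not code from the patch):

    // Before PRE: a + b is computed on the 'then' edge and again at the join,
    // so the 'then' path computes it twice.
    int before(int a, int b, bool c) {
      int t = 0;
      if (c)
        t = a + b;          // available on this edge only
      return t + (a + b);
    }

    // After PRE: a + b is also materialized on the 'else' edge, and the join
    // reuses whichever copy flows in (the PHI the pass creates).
    int after(int a, int b, bool c) {
      int thenV = 0, elseV;
      if (c) {
        thenV = a + b;
        elseV = thenV;      // value already available from this edge
      } else {
        elseV = a + b;      // instruction inserted in the lacking predecessor
      }
      return thenV + elseV;
    }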
+ if (P == CurrentBlock) { + NumWithout = 2; + break; + } else if (!DT->isReachableFromEntry(P)) { + NumWithout = 2; + break; + } - // We don't currently value number ANY inline asm calls. - if (CallInst *CallI = dyn_cast<CallInst>(CurInst)) - if (CallI->isInlineAsm()) - continue; + Value *predV = findLeader(P, ValNo); + if (!predV) { + predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); + PREPred = P; + ++NumWithout; + } else if (predV == CurInst) { + /* CurInst dominates this predecessor. */ + NumWithout = 2; + break; + } else { + predMap.push_back(std::make_pair(predV, P)); + ++NumWith; + } + } - uint32_t ValNo = VN.lookup(CurInst); - - // Look for the predecessors for PRE opportunities. We're - // only trying to solve the basic diamond case, where - // a value is computed in the successor and one predecessor, - // but not the other. We also explicitly disallow cases - // where the successor is its own predecessor, because they're - // more complicated to get right. - unsigned NumWith = 0; - unsigned NumWithout = 0; - BasicBlock *PREPred = nullptr; - predMap.clear(); - - for (pred_iterator PI = pred_begin(CurrentBlock), - PE = pred_end(CurrentBlock); PI != PE; ++PI) { - BasicBlock *P = *PI; - // We're not interested in PRE where the block is its - // own predecessor, or in blocks with predecessors - // that are not reachable. - if (P == CurrentBlock) { - NumWithout = 2; - break; - } else if (!DT->isReachableFromEntry(P)) { - NumWithout = 2; - break; - } + // Don't do PRE when it might increase code size, i.e. when + // we would need to insert instructions in more than one pred. + if (NumWithout != 1 || NumWith == 0) + return false; - Value* predV = findLeader(P, ValNo); - if (!predV) { - predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); - PREPred = P; - ++NumWithout; - } else if (predV == CurInst) { - /* CurInst dominates this predecessor. */ - NumWithout = 2; - break; - } else { - predMap.push_back(std::make_pair(predV, P)); - ++NumWith; - } - } + // Don't do PRE across indirect branch. + if (isa<IndirectBrInst>(PREPred->getTerminator())) + return false; - // Don't do PRE when it might increase code size, i.e. when - // we would need to insert instructions in more than one pred. - if (NumWithout != 1 || NumWith == 0) - continue; + // We can't do PRE safely on a critical edge, so instead we schedule + // the edge to be split and perform the PRE the next time we iterate + // on the function. + unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); + if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { + toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); + return false; + } - // Don't do PRE across indirect branch. - if (isa<IndirectBrInst>(PREPred->getTerminator())) - continue; + // Instantiate the expression in the predecessor that lacked it. + // Because we are going top-down through the block, all value numbers + // will be available in the predecessor by the time we need them. Any + // that weren't originally present will have been instantiated earlier + // in this loop. + Instruction *PREInstr = CurInst->clone(); + bool success = true; + for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) { + Value *Op = PREInstr->getOperand(i); + if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) + continue; - // We can't do PRE safely on a critical edge, so instead we schedule - // the edge to be split and perform the PRE the next time we iterate - // on the function. 
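A critical edge, as referenced above, is an edge whose source block has several successors and whose destination has several predecessors; nothing can be inserted "on" such an edge without first splitting it into a fresh block, which is why the PRE is deferred to the next iteration. A standalone sketch of the test itself (hypothetical block type, not llvm::isCriticalEdge):

    #include <vector>

    struct Block {
      std::vector<Block *> Succs;
      std::vector<Block *> Preds;
    };

    // Src -> Dst is critical when Src branches to more than one place and Dst
    // is reached from more than one place; code placed at the end of Src or
    // the start of Dst would then also run on unrelated paths.
    static bool isCriticalEdgeSketch(const Block *Src, const Block *Dst) {
      return Src->Succs.size() > 1 && Dst->Preds.size() > 1;
    }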
- unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); - if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { - toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); - continue; - } + if (Value *V = findLeader(PREPred, VN.lookup(Op))) { + PREInstr->setOperand(i, V); + } else { + success = false; + break; + } + } - // Instantiate the expression in the predecessor that lacked it. - // Because we are going top-down through the block, all value numbers - // will be available in the predecessor by the time we need them. Any - // that weren't originally present will have been instantiated earlier - // in this loop. - Instruction *PREInstr = CurInst->clone(); - bool success = true; - for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) { - Value *Op = PREInstr->getOperand(i); - if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) - continue; + // Fail out if we encounter an operand that is not available in + // the PRE predecessor. This is typically because of loads which + // are not value numbered precisely. + if (!success) { + DEBUG(verifyRemoved(PREInstr)); + delete PREInstr; + return false; + } - if (Value *V = findLeader(PREPred, VN.lookup(Op))) { - PREInstr->setOperand(i, V); - } else { - success = false; - break; - } - } + PREInstr->insertBefore(PREPred->getTerminator()); + PREInstr->setName(CurInst->getName() + ".pre"); + PREInstr->setDebugLoc(CurInst->getDebugLoc()); + VN.add(PREInstr, ValNo); + ++NumGVNPRE; - // Fail out if we encounter an operand that is not available in - // the PRE predecessor. This is typically because of loads which - // are not value numbered precisely. - if (!success) { - DEBUG(verifyRemoved(PREInstr)); - delete PREInstr; - continue; - } + // Update the availability map to include the new instruction. + addToLeaderTable(ValNo, PREInstr, PREPred); - PREInstr->insertBefore(PREPred->getTerminator()); - PREInstr->setName(CurInst->getName() + ".pre"); - PREInstr->setDebugLoc(CurInst->getDebugLoc()); - VN.add(PREInstr, ValNo); - ++NumGVNPRE; - - // Update the availability map to include the new instruction. - addToLeaderTable(ValNo, PREInstr, PREPred); - - // Create a PHI to make the value available in this block. - PHINode* Phi = PHINode::Create(CurInst->getType(), predMap.size(), - CurInst->getName() + ".pre-phi", - CurrentBlock->begin()); - for (unsigned i = 0, e = predMap.size(); i != e; ++i) { - if (Value *V = predMap[i].first) - Phi->addIncoming(V, predMap[i].second); - else - Phi->addIncoming(PREInstr, PREPred); - } + // Create a PHI to make the value available in this block. + PHINode *Phi = + PHINode::Create(CurInst->getType(), predMap.size(), + CurInst->getName() + ".pre-phi", CurrentBlock->begin()); + for (unsigned i = 0, e = predMap.size(); i != e; ++i) { + if (Value *V = predMap[i].first) + Phi->addIncoming(V, predMap[i].second); + else + Phi->addIncoming(PREInstr, PREPred); + } - VN.add(Phi, ValNo); - addToLeaderTable(ValNo, Phi, CurrentBlock); - Phi->setDebugLoc(CurInst->getDebugLoc()); - CurInst->replaceAllUsesWith(Phi); - if (Phi->getType()->getScalarType()->isPointerTy()) { - // Because we have added a PHI-use of the pointer value, it has now - // "escaped" from alias analysis' perspective. We need to inform - // AA of this. 
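Materializing the missing expression, as the code above does, amounts to cloning the instruction and remapping each operand to its leader in the predecessor, giving up if any operand has no value available there. A simplified restatement with toy types instead of the VN/leader-table machinery:

    #include <map>
    #include <utility>
    #include <vector>

    struct Block;
    struct Inst { std::vector<Inst *> Ops; Block *Parent = nullptr; };

    // Leaders[{block, value}] is the instruction providing that value there.
    using LeaderTable = std::map<std::pair<Block *, Inst *>, Inst *>;

    // Build a copy of I whose operands are all available in Pred, or return
    // nullptr (and insert nothing) if some operand has no leader there.
    static Inst *materializeInPred(const Inst &I, Block *Pred,
                                   const LeaderTable &Leaders) {
      Inst *Clone = new Inst(I);
      Clone->Parent = Pred;
      for (Inst *&Op : Clone->Ops) {
        auto It = Leaders.find({Pred, Op});
        if (It == Leaders.end()) {
          delete Clone;
          return nullptr;        // operand unavailable: fail out, as GVN does
        }
        Op = It->second;
      }
      return Clone;
    }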
- for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; - ++ii) { - unsigned jj = PHINode::getOperandNumForIncomingValue(ii); - VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); - } + VN.add(Phi, ValNo); + addToLeaderTable(ValNo, Phi, CurrentBlock); + Phi->setDebugLoc(CurInst->getDebugLoc()); + CurInst->replaceAllUsesWith(Phi); + if (Phi->getType()->getScalarType()->isPointerTy()) { + // Because we have added a PHI-use of the pointer value, it has now + // "escaped" from alias analysis' perspective. We need to inform + // AA of this. + for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) { + unsigned jj = PHINode::getOperandNumForIncomingValue(ii); + VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); + } - if (MD) - MD->invalidateCachedPointerInfo(Phi); - } - VN.erase(CurInst); - removeFromLeaderTable(ValNo, CurInst, CurrentBlock); + if (MD) + MD->invalidateCachedPointerInfo(Phi); + } + VN.erase(CurInst); + removeFromLeaderTable(ValNo, CurInst, CurrentBlock); + + DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); + if (MD) + MD->removeInstruction(CurInst); + DEBUG(verifyRemoved(CurInst)); + CurInst->eraseFromParent(); + return true; +} + +/// performPRE - Perform a purely local form of PRE that looks for diamond +/// control flow patterns and attempts to perform simple PRE at the join point. +bool GVN::performPRE(Function &F) { + bool Changed = false; + for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) { + // Nothing to PRE in the entry block. + if (CurrentBlock == &F.getEntryBlock()) + continue; + + // Don't perform PRE on a landing pad. + if (CurrentBlock->isLandingPad()) + continue; - DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); - if (MD) MD->removeInstruction(CurInst); - DEBUG(verifyRemoved(CurInst)); - CurInst->eraseFromParent(); - Changed = true; + for (BasicBlock::iterator BI = CurrentBlock->begin(), + BE = CurrentBlock->end(); + BI != BE;) { + Instruction *CurInst = BI++; + Changed = performScalarPRE(CurInst); } } @@ -2641,25 +2659,21 @@ bool GVN::iterateOnFunction(Function &F) { // Top-down walk of the dominator tree bool Changed = false; -#if 0 - // Needed for value numbering with phi construction to work. - ReversePostOrderTraversal<Function*> RPOT(&F); - for (ReversePostOrderTraversal<Function*>::rpo_iterator RI = RPOT.begin(), - RE = RPOT.end(); RI != RE; ++RI) - Changed |= processBlock(*RI); -#else // Save the blocks this function have before transformation begins. GVN may // split critical edge, and hence may invalidate the RPO/DT iterator. // std::vector<BasicBlock *> BBVect; BBVect.reserve(256); - for (DomTreeNode *x : depth_first(DT->getRootNode())) - BBVect.push_back(x->getBlock()); + // Needed for value numbering with phi construction to work. + ReversePostOrderTraversal<Function *> RPOT(&F); + for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(), + RE = RPOT.end(); + RI != RE; ++RI) + BBVect.push_back(*RI); for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end(); I != E; I++) Changed |= processBlock(*I); -#endif return Changed; } @@ -2802,7 +2816,7 @@ bool GVN::processFoldableCondBr(BranchInst *BI) { return true; } -// performPRE() will trigger assert if it come across an instruciton without +// performPRE() will trigger assert if it comes across an instruction without // associated val-num. 
As it normally has far more live instructions than dead // instructions, it makes more sense just to "fabricate" a val-number for the // dead code than checking if instruction involved is dead or not. diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 9cf0ca0912f9..c01f57f26ea9 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -31,6 +31,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -69,11 +70,12 @@ static cl::opt<bool> ReduceLiveIVs("liv-reduce", cl::Hidden, namespace { class IndVarSimplify : public LoopPass { - LoopInfo *LI; - ScalarEvolution *SE; - DominatorTree *DT; - const DataLayout *DL; - TargetLibraryInfo *TLI; + LoopInfo *LI; + ScalarEvolution *SE; + DominatorTree *DT; + const DataLayout *DL; + TargetLibraryInfo *TLI; + const TargetTransformInfo *TTI; SmallVector<WeakVH, 16> DeadInsts; bool Changed; @@ -650,7 +652,7 @@ namespace { struct WideIVInfo { PHINode *NarrowIV; Type *WidestNativeType; // Widest integer type created [sz]ext - bool IsSigned; // Was an sext user seen before a zext? + bool IsSigned; // Was a sext user seen before a zext? WideIVInfo() : NarrowIV(nullptr), WidestNativeType(nullptr), IsSigned(false) {} @@ -661,7 +663,7 @@ namespace { /// extended by this sign or zero extend operation. This is used to determine /// the final width of the IV before actually widening it. static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, - const DataLayout *DL) { + const DataLayout *DL, const TargetTransformInfo *TTI) { bool IsSigned = Cast->getOpcode() == Instruction::SExt; if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) return; @@ -671,6 +673,19 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, if (DL && !DL->isLegalInteger(Width)) return; + // Cast is either an sext or zext up to this point. + // We should not widen an indvar if arithmetics on the wider indvar are more + // expensive than those on the narrower indvar. We check only the cost of ADD + // because at least an ADD is required to increment the induction variable. We + // could compute more comprehensively the cost of all instructions on the + // induction variable when necessary. 
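The gate described above is deliberately crude: it compares only the target cost of an add in the wide and narrow types, since an add is the one operation every induction variable needs. A standalone sketch with a hypothetical cost callback standing in for TargetTransformInfo::getArithmeticInstrCost:

    #include <functional>

    enum class IntTy { I8, I16, I32, I64 };

    // Widening is only worthwhile if incrementing the induction variable in
    // the wider type costs no more than in the narrow type.
    static bool shouldWidenIndVar(IntTy Narrow, IntTy Wide,
                                  const std::function<unsigned(IntTy)> &AddCost) {
      return AddCost(Wide) <= AddCost(Narrow);
    }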
+ if (TTI && + TTI->getArithmeticInstrCost(Instruction::Add, Ty) > + TTI->getArithmeticInstrCost(Instruction::Add, + Cast->getOperand(0)->getType())) { + return; + } + if (!WI.WidestNativeType) { WI.WidestNativeType = SE->getEffectiveSCEVType(Ty); WI.IsSigned = IsSigned; @@ -757,8 +772,13 @@ protected: const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU); + const SCEV *GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, + unsigned OpCode) const; + Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); + bool WidenLoopCompare(NarrowIVDefUse DU); + void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; } // anonymous namespace @@ -833,18 +853,35 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { } } +const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, + unsigned OpCode) const { + if (OpCode == Instruction::Add) + return SE->getAddExpr(LHS, RHS); + if (OpCode == Instruction::Sub) + return SE->getMinusSCEV(LHS, RHS); + if (OpCode == Instruction::Mul) + return SE->getMulExpr(LHS, RHS); + + llvm_unreachable("Unsupported opcode."); +} + /// No-wrap operations can transfer sign extension of their result to their /// operands. Generate the SCEV value for the widened operation without /// actually modifying the IR yet. If the expression after extending the /// operands is an AddRec for this loop, return it. const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { + // Handle the common case of add<nsw/nuw> - if (DU.NarrowUse->getOpcode() != Instruction::Add) + const unsigned OpCode = DU.NarrowUse->getOpcode(); + // Only Add/Sub/Mul instructions supported yet. + if (OpCode != Instruction::Add && OpCode != Instruction::Sub && + OpCode != Instruction::Mul) return nullptr; // One operand (NarrowDef) has already been extended to WideDef. Now determine // if extending the other will lead to a recurrence. - unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0; + const unsigned ExtendOperIdx = + DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0; assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU"); const SCEV *ExtendOperExpr = nullptr; @@ -859,13 +896,20 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { else return nullptr; - // When creating this AddExpr, don't apply the current operations NSW or NUW + // When creating this SCEV expr, don't apply the current operations NSW or NUW // flags. This instruction may be guarded by control flow that the no-wrap // behavior depends on. Non-control-equivalent instructions can be mapped to // the same SCEV expression, and it would be incorrect to transfer NSW/NUW // semantics to those operations. - const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>( - SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr)); + const SCEV *lhs = SE->getSCEV(DU.WideDef); + const SCEV *rhs = ExtendOperExpr; + + // Let's swap operands to the initial order for the case of non-commutative + // operations, like SUB. See PR21014. + if (ExtendOperIdx == 0) + std::swap(lhs, rhs); + const SCEVAddRecExpr *AddRec = + dyn_cast<SCEVAddRecExpr>(GetSCEVByOpCode(lhs, rhs, OpCode)); if (!AddRec || AddRec->getLoop() != L) return nullptr; @@ -908,6 +952,35 @@ static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); } +/// If the narrow use is a compare instruction, then widen the compare +// (and possibly the other operand). 
The extend operation is hoisted into the +// loop preheader as far as possible. +bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) { + ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse); + if (!Cmp) + return false; + + // Sign of IV user and compare must match. + if (IsSigned != CmpInst::isSigned(Cmp->getPredicate())) + return false; + + Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0); + unsigned CastWidth = SE->getTypeSizeInBits(Op->getType()); + unsigned IVWidth = SE->getTypeSizeInBits(WideType); + assert (CastWidth <= IVWidth && "Unexpected width while widening compare."); + + // Widen the compare instruction. + IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef); + + // Widen the other operand of the compare, if necessary. + if (CastWidth < IVWidth) { + Value *ExtOp = getExtend(Op, WideType, IsSigned, Cmp); + DU.NarrowUse->replaceUsesOfWith(Op, ExtOp); + } + return true; +} + /// WidenIVUse - Determine whether an individual user of the narrow IV can be /// widened. If so, return the wide clone of the user. Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { @@ -975,10 +1048,15 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // Does this user itself evaluate to a recurrence after widening? const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse); + if (!WideAddRec) + WideAddRec = GetExtendedOperandRecurrence(DU); + if (!WideAddRec) { - WideAddRec = GetExtendedOperandRecurrence(DU); - } - if (!WideAddRec) { + // If use is a loop condition, try to promote the condition instead of + // truncating the IV first. + if (WidenLoopCompare(DU)) + return nullptr; + // This user does not evaluate to a recurence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. @@ -1024,7 +1102,7 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { Instruction *NarrowUser = cast<Instruction>(U); // Handle data flow merges and bizarre phi cycles. - if (!Widened.insert(NarrowUser)) + if (!Widened.insert(NarrowUser).second) continue; NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUser, WideDef)); @@ -1124,14 +1202,16 @@ namespace { class IndVarSimplifyVisitor : public IVVisitor { ScalarEvolution *SE; const DataLayout *DL; + const TargetTransformInfo *TTI; PHINode *IVPhi; public: WideIVInfo WI; IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, - const DataLayout *DL, const DominatorTree *DTree): - SE(SCEV), DL(DL), IVPhi(IV) { + const DataLayout *DL, const TargetTransformInfo *TTI, + const DominatorTree *DTree) + : SE(SCEV), DL(DL), TTI(TTI), IVPhi(IV) { DT = DTree; WI.NarrowIV = IVPhi; if (ReduceLiveIVs) @@ -1139,7 +1219,9 @@ namespace { } // Implement the interface used by simplifyUsersOfIV. - void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, DL); } + void visitCast(CastInst *Cast) override { + visitIVCast(Cast, WI, SE, DL, TTI); + } }; } @@ -1173,7 +1255,7 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, PHINode *CurrIV = LoopPhis.pop_back_val(); // Information about sign/zero extensions of CurrIV. - IndVarSimplifyVisitor Visitor(CurrIV, SE, DL, DT); + IndVarSimplifyVisitor Visitor(CurrIV, SE, DL, TTI, DT); Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor); @@ -1200,9 +1282,9 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, /// BackedgeTakenInfo. 
If these expressions have not been reduced, then /// expanding them may incur additional cost (albeit in the loop preheader). static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, - SmallPtrSet<const SCEV*, 8> &Processed, + SmallPtrSetImpl<const SCEV*> &Processed, ScalarEvolution *SE) { - if (!Processed.insert(S)) + if (!Processed.insert(S).second) return false; // If the backedge-taken count is a UDiv, it's very likely a UDiv that @@ -1373,7 +1455,7 @@ static bool needsLFTR(Loop *L, DominatorTree *DT) { /// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils /// down to checking that all operands are constant and listing instructions /// that may hide undef. -static bool hasConcreteDefImpl(Value *V, SmallPtrSet<Value*, 8> &Visited, +static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited, unsigned Depth) { if (isa<Constant>(V)) return !isa<UndefValue>(V); @@ -1393,7 +1475,7 @@ static bool hasConcreteDefImpl(Value *V, SmallPtrSet<Value*, 8> &Visited, // Optimistically handle other instructions. for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) { - if (!Visited.insert(*OI)) + if (!Visited.insert(*OI).second) continue; if (!hasConcreteDefImpl(*OI, Visited, Depth+1)) return false; @@ -1637,8 +1719,29 @@ LinearFunctionTestReplace(Loop *L, // FIXME: In theory, SCEV could drop flags even though they exist in IR. // A more robust solution would involve getting a new expression for // CmpIndVar by applying non-NSW/NUW AddExprs. + auto WrappingFlags = + ScalarEvolution::setFlags(SCEV::FlagNUW, SCEV::FlagNSW); + const SCEV *IVInit = IncrementedIndvarSCEV->getStart(); + if (SE->getTypeSizeInBits(IVInit->getType()) > + SE->getTypeSizeInBits(IVCount->getType())) + IVInit = SE->getTruncateExpr(IVInit, IVCount->getType()); + unsigned BitWidth = SE->getTypeSizeInBits(IVCount->getType()); + Type *WideTy = IntegerType::get(SE->getContext(), BitWidth + 1); + // Check if InitIV + BECount+1 requires sign/zero extension. + // If not, clear the corresponding flag from WrappingFlags because it is not + // necessary for those flags in the IncrementedIndvarSCEV expression. + if (SE->getSignExtendExpr(SE->getAddExpr(IVInit, BackedgeTakenCount), + WideTy) == + SE->getAddExpr(SE->getSignExtendExpr(IVInit, WideTy), + SE->getSignExtendExpr(BackedgeTakenCount, WideTy))) + WrappingFlags = ScalarEvolution::clearFlags(WrappingFlags, SCEV::FlagNSW); + if (SE->getZeroExtendExpr(SE->getAddExpr(IVInit, BackedgeTakenCount), + WideTy) == + SE->getAddExpr(SE->getZeroExtendExpr(IVInit, WideTy), + SE->getZeroExtendExpr(BackedgeTakenCount, WideTy))) + WrappingFlags = ScalarEvolution::clearFlags(WrappingFlags, SCEV::FlagNUW); if (!ScalarEvolution::maskFlags(IncrementedIndvarSCEV->getNoWrapFlags(), - SCEV::FlagNUW | SCEV::FlagNSW)) { + WrappingFlags)) { // Add one to the "backedge-taken" count to get the trip count. // This addition may overflow, which is valid as long as the comparison is // truncated to BackedgeTakenCount->getType(). @@ -1832,6 +1935,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); DL = DLP ? 
&DLP->getDataLayout() : nullptr; TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + TTI = getAnalysisIfAvailable<TargetTransformInfo>(); DeadInsts.clear(); Changed = false; diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 21f80385cf46..78beb3f98dcd 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -44,7 +45,7 @@ STATISTIC(NumFolds, "Number of terminators folded"); STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi"); static cl::opt<unsigned> -Threshold("jump-threading-threshold", +BBDuplicateThreshold("jump-threading-threshold", cl::desc("Max block size to duplicate for jump threading"), cl::init(6), cl::Hidden); @@ -87,6 +88,8 @@ namespace { #endif DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet; + unsigned BBDupThreshold; + // RAII helper for updating the recursion stack. struct RecursionSetRemover { DenseSet<std::pair<Value*, BasicBlock*> > &TheSet; @@ -102,7 +105,8 @@ namespace { }; public: static char ID; // Pass identification - JumpThreading() : FunctionPass(ID) { + JumpThreading(int T = -1) : FunctionPass(ID) { + BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T); initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); } @@ -123,9 +127,11 @@ namespace { bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, - ConstantPreference Preference); + ConstantPreference Preference, + Instruction *CxtI = nullptr); bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB, - ConstantPreference Preference); + ConstantPreference Preference, + Instruction *CxtI = nullptr); bool ProcessBranchOnPHI(PHINode *PN); bool ProcessBranchOnXOR(BinaryOperator *BO); @@ -144,7 +150,7 @@ INITIALIZE_PASS_END(JumpThreading, "jump-threading", "Jump Threading", false, false) // Public interface to the Jump Threading pass -FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); } +FunctionPass *llvm::createJumpThreadingPass(int Threshold) { return new JumpThreading(Threshold); } /// runOnFunction - Top level algorithm. /// @@ -182,7 +188,7 @@ bool JumpThreading::runOnFunction(Function &F) { // If the block is trivially dead, zap it. This eliminates the successor // edges which simplifies the CFG. - if (pred_begin(BB) == pred_end(BB) && + if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) { DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName() << "' with terminator: " << *BB->getTerminator() << '\n'); @@ -339,7 +345,8 @@ static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { /// bool JumpThreading:: ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, - ConstantPreference Preference) { + ConstantPreference Preference, + Instruction *CxtI) { // This method walks up use-def chains recursively. Because of this, we could // get into an infinite loop going around loops in the use-def chain. 
To // prevent this, keep track of what (value, block) pairs we've already visited @@ -381,7 +388,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, BasicBlock *P = *PI; // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. - Constant *PredCst = LVI->getConstantOnEdge(V, P, BB); + Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI); if (Constant *KC = getKnownConstant(PredCst, Preference)) Result.push_back(std::make_pair(KC, P)); } @@ -397,7 +404,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i))); } else { Constant *CI = LVI->getConstantOnEdge(InVal, - PN->getIncomingBlock(i), BB); + PN->getIncomingBlock(i), + BB, CxtI); if (Constant *KC = getKnownConstant(CI, Preference)) Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i))); } @@ -416,9 +424,9 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, if (I->getOpcode() == Instruction::Or || I->getOpcode() == Instruction::And) { ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, - WantInteger); + WantInteger, CxtI); ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals, - WantInteger); + WantInteger, CxtI); if (LHSVals.empty() && RHSVals.empty()) return false; @@ -459,7 +467,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, isa<ConstantInt>(I->getOperand(1)) && cast<ConstantInt>(I->getOperand(1))->isOne()) { ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result, - WantInteger); + WantInteger, CxtI); if (Result.empty()) return false; @@ -477,7 +485,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) { PredValueInfoTy LHSVals; ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals, - WantInteger); + WantInteger, CxtI); // Try to use constant folding to simplify the binary operator. for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) { @@ -511,7 +519,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, LazyValueInfo::Tristate ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS, - cast<Constant>(RHS), PredBB, BB); + cast<Constant>(RHS), PredBB, BB, + CxtI ? CxtI : Cmp); if (ResT == LazyValueInfo::Unknown) continue; Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT); @@ -524,7 +533,6 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, return !Result.empty(); } - // If comparing a live-in value against a constant, see if we know the // live-in value on any predecessors. if (isa<Constant>(Cmp->getOperand(1)) && Cmp->getType()->isIntegerTy()) { @@ -538,7 +546,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // predecessor, use that information to try to thread this block. LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0), - RHSCst, P, BB); + RHSCst, P, BB, CxtI ? 
CxtI : Cmp); if (Res == LazyValueInfo::Unknown) continue; @@ -554,7 +562,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) { PredValueInfoTy LHSVals; ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, - WantInteger); + WantInteger, CxtI); for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) { Constant *V = LHSVals[i].first; @@ -577,7 +585,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, PredValueInfoTy Conds; if ((TrueVal || FalseVal) && ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds, - WantInteger)) { + WantInteger, CxtI)) { for (unsigned i = 0, e = Conds.size(); i != e; ++i) { Constant *Cond = Conds[i].first; @@ -604,7 +612,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, } // If all else fails, see if LVI can figure out a constant value for us. - Constant *CI = LVI->getConstant(V, BB); + Constant *CI = LVI->getConstant(V, BB, CxtI); if (Constant *KC = getKnownConstant(CI, Preference)) { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) Result.push_back(std::make_pair(KC, *PI)); @@ -654,7 +662,7 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) { bool JumpThreading::ProcessBlock(BasicBlock *BB) { // If the block is trivially dead, just return and let the caller nuke it. // This simplifies other transformations. - if (pred_begin(BB) == pred_end(BB) && + if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) return false; @@ -744,7 +752,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // All the rest of our checks depend on the condition being an instruction. if (!CondInst) { // FIXME: Unify this with code below. - if (ProcessThreadableEdges(Condition, BB, Preference)) + if (ProcessThreadableEdges(Condition, BB, Preference, Terminator)) return true; return false; } @@ -766,13 +774,14 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // FIXME: We could handle mixed true/false by duplicating code. LazyValueInfo::Tristate Baseline = LVI->getPredicateOnEdge(CondCmp->getPredicate(), CondCmp->getOperand(0), - CondConst, *PI, BB); + CondConst, *PI, BB, CondCmp); if (Baseline != LazyValueInfo::Unknown) { // Check that all remaining incoming values match the first one. while (++PI != PE) { LazyValueInfo::Tristate Ret = LVI->getPredicateOnEdge(CondCmp->getPredicate(), - CondCmp->getOperand(0), CondConst, *PI, BB); + CondCmp->getOperand(0), CondConst, *PI, BB, + CondCmp); if (Ret != Baseline) break; } @@ -787,6 +796,21 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { } } + } else if (CondBr && CondConst && CondBr->isConditional()) { + // There might be an invariant in the same block with the conditional + // that can determine the predicate. + + LazyValueInfo::Tristate Ret = + LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0), + CondConst, CondCmp); + if (Ret != LazyValueInfo::Unknown) { + unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0; + unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1; + CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true); + BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr); + CondBr->eraseFromParent(); + return true; + } } if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB)) @@ -814,7 +838,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // a PHI node in the current block. 
If we can prove that any predecessors // compute a predictable value based on a PHI node, thread those predecessors. // - if (ProcessThreadableEdges(CondInst, BB, Preference)) + if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator)) return true; // If this is an otherwise-unfoldable branch on a phi node in the current @@ -877,6 +901,9 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // If the returned value is the load itself, replace with an undef. This can // only happen in dead loops. if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType()); + if (AvailableVal->getType() != LI->getType()) + AvailableVal = + CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI); LI->replaceAllUsesWith(AvailableVal); LI->eraseFromParent(); return true; @@ -888,9 +915,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (BBIt != LoadBB->begin()) return false; - // If all of the loads and stores that feed the value have the same TBAA tag, - // then we can propagate it onto any newly inserted loads. - MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa); + // If all of the loads and stores that feed the value have the same AA tags, + // then we can propagate them onto any newly inserted loads. + AAMDNodes AATags; + LI->getAAMetadata(AATags); SmallPtrSet<BasicBlock*, 8> PredsScanned; typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy; @@ -904,21 +932,21 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { BasicBlock *PredBB = *PI; // If we already scanned this predecessor, skip it. - if (!PredsScanned.insert(PredBB)) + if (!PredsScanned.insert(PredBB).second) continue; // Scan the predecessor to see if the value is available in the pred. BBIt = PredBB->end(); - MDNode *ThisTBAATag = nullptr; + AAMDNodes ThisAATags; Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6, - nullptr, &ThisTBAATag); + nullptr, &ThisAATags); if (!PredAvailable) { OneUnavailablePred = PredBB; continue; } - // If tbaa tags disagree or are not present, forget about them. - if (TBAATag != ThisTBAATag) TBAATag = nullptr; + // If AA tags disagree or are not present, forget about them. + if (AATags != ThisAATags) AATags = AAMDNodes(); // If so, this load is partially redundant. Remember this info so that we // can create a PHI node. @@ -978,8 +1006,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { LI->getAlignment(), UnavailablePred->getTerminator()); NewVal->setDebugLoc(LI->getDebugLoc()); - if (TBAATag) - NewVal->setMetadata(LLVMContext::MD_tbaa, TBAATag); + if (AATags) + NewVal->setAAMetadata(AATags); AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal)); } @@ -1006,7 +1034,16 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { assert(I != AvailablePreds.end() && I->first == P && "Didn't find entry for predecessor!"); - PN->addIncoming(I->second, I->first); + // If we have an available predecessor but it requires casting, insert the + // cast in the predecessor and use the cast. Note that we have to update the + // AvailablePreds vector as we go so that all of the PHI entries for this + // predecessor use the same bitcast. 
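// A minimal sketch of the casting rule described above (the helper name and
// signature are illustrative, not from this patch): when the value available
// in predecessor P has a different type than the load, a single
// bit-or-pointer cast is materialized in P and fed to the PHI. The real code
// additionally writes the cast back into AvailablePreds so every PHI entry
// for P reuses the same cast instruction.
static void addAvailableValue(PHINode *PN, LoadInst *LI, Value *Avail,
                              BasicBlock *P) {
  if (Avail->getType() != LI->getType())
    Avail = CastInst::CreateBitOrPointerCast(Avail, LI->getType(), "",
                                             P->getTerminator());
  PN->addIncoming(Avail, P);
}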
+ Value *&PredV = I->second; + if (PredV->getType() != LI->getType()) + PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "", + P->getTerminator()); + + PN->addIncoming(PredV, I->first); } //cerr << "PRE: " << *LI << *PN << "\n"; @@ -1081,14 +1118,15 @@ FindMostPopularDest(BasicBlock *BB, } bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, - ConstantPreference Preference) { + ConstantPreference Preference, + Instruction *CxtI) { // If threading this would thread across a loop header, don't even try to // thread the edge. if (LoopHeaders.count(BB)) return false; PredValueInfoTy PredValues; - if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference)) + if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, CxtI)) return false; assert(!PredValues.empty() && @@ -1113,7 +1151,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, for (unsigned i = 0, e = PredValues.size(); i != e; ++i) { BasicBlock *Pred = PredValues[i].second; - if (!SeenPreds.insert(Pred)) + if (!SeenPreds.insert(Pred).second) continue; // Duplicate predecessor entry. // If the predecessor ends with an indirect goto, we can't change its @@ -1253,10 +1291,10 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { PredValueInfoTy XorOpValues; bool isLHS = true; if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues, - WantInteger)) { + WantInteger, BO)) { assert(XorOpValues.empty()); if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues, - WantInteger)) + WantInteger, BO)) return false; isLHS = false; } @@ -1366,8 +1404,8 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, return false; } - unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, Threshold); - if (JumpThreadCost > Threshold) { + unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold); + if (JumpThreadCost > BBDupThreshold) { DEBUG(dbgs() << " Not threading BB '" << BB->getName() << "' - Cost is too high: " << JumpThreadCost << "\n"); return false; @@ -1509,8 +1547,8 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, return false; } - unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, Threshold); - if (DuplicationCost > Threshold) { + unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold); + if (DuplicationCost > BBDupThreshold) { DEBUG(dbgs() << " Not duplicating BB '" << BB->getName() << "' - Cost is too high: " << DuplicationCost << "\n"); return false; @@ -1672,10 +1710,10 @@ bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { // cases will be threaded in any case. LazyValueInfo::Tristate LHSFolds = LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1), - CondRHS, Pred, BB); + CondRHS, Pred, BB, CondCmp); LazyValueInfo::Tristate RHSFolds = LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2), - CondRHS, Pred, BB); + CondRHS, Pred, BB, CondCmp); if ((LHSFolds != LazyValueInfo::Unknown || RHSFolds != LazyValueInfo::Unknown) && LHSFolds != RHSFolds) { diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index abcceb20050a..e145981846d9 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -120,6 +120,7 @@ namespace { bool MayThrow; // The current loop contains an instruction which // may throw, thus preventing code motion of // instructions with side effects. 
+ bool HeaderMayThrow; // Same as previous, but specific to loop header DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap; /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. @@ -130,6 +131,9 @@ namespace { /// set. void deleteAnalysisValue(Value *V, Loop *L) override; + /// Simple Analysis hook. Delete loop L from alias set map. + void deleteAnalysisLoop(Loop *L) override; + /// SinkRegion - Walk the specified region of the CFG (defined by all blocks /// dominated by the specified block, and that are in the current loop) in /// reverse depth first order w.r.t the DominatorTree. This allows us to @@ -180,9 +184,9 @@ namespace { /// store into the memory location pointed to by V. /// bool pointerInvalidatedByLoop(Value *V, uint64_t Size, - const MDNode *TBAAInfo) { + const AAMDNodes &AAInfo) { // Check to see if any of the basic blocks in CurLoop invalidate *V. - return CurAST->getAliasSetForPointer(V, Size, TBAAInfo).isMod(); + return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod(); } bool canSinkOrHoistInst(Instruction &I); @@ -270,7 +274,12 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { CurAST->add(*BB); // Incorporate the specified basic block } - MayThrow = false; + HeaderMayThrow = false; + BasicBlock *Header = L->getHeader(); + for (BasicBlock::iterator I = Header->begin(), E = Header->end(); + (I != E) && !HeaderMayThrow; ++I) + HeaderMayThrow |= I->mayThrow(); + MayThrow = HeaderMayThrow; // TODO: We've already searched for instructions which may throw in subloops. // We may want to reuse this information. for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end(); @@ -313,7 +322,8 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // SSAUpdater strategy during promotion that was LCSSA aware and reformed // it as it went. if (Changed) - formLCSSARecursively(*L, *DT, getAnalysisIfAvailable<ScalarEvolution>()); + formLCSSARecursively(*L, *DT, LI, + getAnalysisIfAvailable<ScalarEvolution>()); } // Check that neither this loop nor its parent have had LCSSA broken. LICM is @@ -441,15 +451,18 @@ bool LICM::canSinkOrHoistInst(Instruction &I) { // in the same alias set as something that ends up being modified. if (AA->pointsToConstantMemory(LI->getOperand(0))) return true; - if (LI->getMetadata("invariant.load")) + if (LI->getMetadata(LLVMContext::MD_invariant_load)) return true; // Don't hoist loads which have may-aliased stores in loop. uint64_t Size = 0; if (LI->getType()->isSized()) Size = AA->getTypeStoreSize(LI->getType()); - return !pointerInvalidatedByLoop(LI->getOperand(0), Size, - LI->getMetadata(LLVMContext::MD_tbaa)); + + AAMDNodes AAInfo; + LI->getAAMetadata(AAInfo); + + return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo); } else if (CallInst *CI = dyn_cast<CallInst>(&I)) { // Don't sink or hoist dbg info; it's legal, but not useful. if (isa<DbgInfoIntrinsic>(I)) @@ -594,8 +607,13 @@ void LICM::sink(Instruction &I) { // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of // the instruction. while (!I.use_empty()) { + Instruction *User = I.user_back(); + if (!DT->isReachableFromEntry(User->getParent())) { + User->replaceUsesOfWith(&I, UndefValue::get(I.getType())); + continue; + } // The user must be a PHI node. 
- PHINode *PN = cast<PHINode>(I.user_back()); + PHINode *PN = cast<PHINode>(User); BasicBlock *ExitBlock = PN->getParent(); assert(ExitBlockSet.count(ExitBlock) && @@ -647,12 +665,7 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { bool LICM::isGuaranteedToExecute(Instruction &Inst) { - // Somewhere in this loop there is an instruction which may throw and make us - // exit the loop. - if (MayThrow) - return false; - - // Otherwise we have to check to make sure that the instruction dominates all + // We have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop // which does not execute this instruction, so we can't hoist it. @@ -660,7 +673,14 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) { // common), it is always guaranteed to dominate the exit blocks. Since this // is a common case, and can save some work, check it now. if (Inst.getParent() == CurLoop->getHeader()) - return true; + // If there's a throw in the header block, we can't guarantee we'll reach + // Inst. + return !HeaderMayThrow; + + // Somewhere in this loop there is an instruction which may throw and make us + // exit the loop. + if (MayThrow) + return false; // Get the exit blocks for the current loop. SmallVector<BasicBlock*, 8> ExitBlocks; @@ -682,7 +702,7 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) { namespace { class LoopPromoter : public LoadAndStorePromoter { Value *SomePtr; // Designated pointer to store to. - SmallPtrSet<Value*, 4> &PointerMustAliases; + SmallPtrSetImpl<Value*> &PointerMustAliases; SmallVectorImpl<BasicBlock*> &LoopExitBlocks; SmallVectorImpl<Instruction*> &LoopInsertPts; PredIteratorCache &PredCache; @@ -690,7 +710,7 @@ namespace { LoopInfo &LI; DebugLoc DL; int Alignment; - MDNode *TBAATag; + AAMDNodes AATags; Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const { if (Instruction *I = dyn_cast<Instruction>(V)) @@ -710,14 +730,14 @@ namespace { public: LoopPromoter(Value *SP, const SmallVectorImpl<Instruction *> &Insts, - SSAUpdater &S, SmallPtrSet<Value *, 4> &PMA, + SSAUpdater &S, SmallPtrSetImpl<Value *> &PMA, SmallVectorImpl<BasicBlock *> &LEB, SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC, AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment, - MDNode *TBAATag) + const AAMDNodes &AATags) : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast), - LI(li), DL(dl), Alignment(alignment), TBAATag(TBAATag) {} + LI(li), DL(dl), Alignment(alignment), AATags(AATags) {} bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction*> &) const override { @@ -743,7 +763,7 @@ namespace { StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); NewSI->setAlignment(Alignment); NewSI->setDebugLoc(DL); - if (TBAATag) NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag); + if (AATags) NewSI->setAAMetadata(AATags); } } @@ -798,11 +818,12 @@ void LICM::PromoteAliasSet(AliasSet &AS, // We start with an alignment of one and try to find instructions that allow // us to prove better alignment. unsigned Alignment = 1; - MDNode *TBAATag = nullptr; + AAMDNodes AATags; + bool HasDedicatedExits = CurLoop->hasDedicatedExits(); // Check that all of the pointers in the alias set have the same type. We // cannot (yet) promote a memory location that is loaded and stored in - // different sizes. While we are at it, collect alignment and TBAA info. + // different sizes. 
While we are at it, collect alignment and AA info. for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) { Value *ASIV = ASI->getValue(); PointerMustAliases.insert(ASIV); @@ -833,6 +854,13 @@ void LICM::PromoteAliasSet(AliasSet &AS, assert(!store->isVolatile() && "AST broken"); if (!store->isSimple()) return; + // Don't sink stores from loops without dedicated block exits. Exits + // containing indirect branches are not transformed by loop simplify, + // make sure we catch that. An additional load may be generated in the + // preheader for SSA updater, so also avoid sinking when no preheader + // is available. + if (!HasDedicatedExits || !Preheader) + return; // Note that we only check GuaranteedToExecute inside the store case // so that we do not introduce stores where they did not exist before @@ -855,13 +883,12 @@ void LICM::PromoteAliasSet(AliasSet &AS, } else return; // Not a load or store. - // Merge the TBAA tags. + // Merge the AA tags. if (LoopUses.empty()) { - // On the first load/store, just take its TBAA tag. - TBAATag = UI->getMetadata(LLVMContext::MD_tbaa); - } else if (TBAATag) { - TBAATag = MDNode::getMostGenericTBAA(TBAATag, - UI->getMetadata(LLVMContext::MD_tbaa)); + // On the first load/store, just take its AA tags. + UI->getAAMetadata(AATags); + } else if (AATags) { + UI->getAAMetadata(AATags, /* Merge = */ true); } LoopUses.push_back(UI); @@ -896,7 +923,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, SmallVector<PHINode*, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, - InsertPts, PIC, *CurAST, *LI, DL, Alignment, TBAATag); + InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. @@ -905,7 +932,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, Preheader->getTerminator()); PreheaderLoad->setAlignment(Alignment); PreheaderLoad->setDebugLoc(DL); - if (TBAATag) PreheaderLoad->setMetadata(LLVMContext::MD_tbaa, TBAATag); + if (AATags) PreheaderLoad->setAAMetadata(AATags); SSA.AddAvailableValue(Preheader, PreheaderLoad); // Rewrite all the loads in the loop and remember all the definitions from @@ -936,3 +963,13 @@ void LICM::deleteAnalysisValue(Value *V, Loop *L) { AST->deleteValue(V); } + +/// Simple Analysis hook. Delete value L from alias set map. 
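// An assumed source-level illustration of the HeaderMayThrow refinement above
// (the function and report_overflow() are made up for this example): x / y is
// loop-invariant but may trap, so LICM only hoists it when it is guaranteed to
// execute. Previously a possibly-throwing call anywhere in the loop blocked
// that; with this change a header instruction only needs the header itself to
// be free of throwing instructions.
void report_overflow();              // assumed external call, may throw
int scaled_sum(const int *a, int n, int x, int y) {
  int s = 0;
  for (int i = 0; i < n; ++i) {
    s += a[i] + x / y;               // candidate for hoisting into the preheader
    if (s < 0)
      report_overflow();             // may-throw call outside the loop header
  }
  return s;
}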
+void LICM::deleteAnalysisLoop(Loop *L) { + AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); + if (!AST) + return; + + delete AST; + LoopToAliasSetMap.erase(L); +} diff --git a/lib/Transforms/Scalar/LLVMBuild.txt b/lib/Transforms/Scalar/LLVMBuild.txt index 1f6df7dac7ff..2bb49a3026c9 100644 --- a/lib/Transforms/Scalar/LLVMBuild.txt +++ b/lib/Transforms/Scalar/LLVMBuild.txt @@ -20,4 +20,4 @@ type = Library name = Scalar parent = Transforms library_name = ScalarOpts -required_libraries = Analysis Core IPA InstCombine Support Target TransformUtils +required_libraries = Analysis Core InstCombine ProfileData Support Target TransformUtils diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp index 846aa703c9c3..11e4d7606d96 100644 --- a/lib/Transforms/Scalar/LoadCombine.cpp +++ b/lib/Transforms/Scalar/LoadCombine.cpp @@ -15,6 +15,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/Pass.h" #include "llvm/IR/DataLayout.h" @@ -51,13 +53,16 @@ struct LoadPOPPair { class LoadCombine : public BasicBlockPass { LLVMContext *C; const DataLayout *DL; + AliasAnalysis *AA; public: LoadCombine() : BasicBlockPass(ID), - C(nullptr), DL(nullptr) { + C(nullptr), DL(nullptr), AA(nullptr) { initializeSROAPass(*PassRegistry::getPassRegistry()); } + + using llvm::Pass::doInitialization; bool doInitialization(Function &) override; bool runOnBasicBlock(BasicBlock &BB) override; void getAnalysisUsage(AnalysisUsage &AU) const override; @@ -223,19 +228,23 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { if (skipOptnoneFunction(BB) || !DL) return false; + AA = &getAnalysis<AliasAnalysis>(); + IRBuilder<true, TargetFolder> TheBuilder(BB.getContext(), TargetFolder(DL)); Builder = &TheBuilder; DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap; + AliasSetTracker AST(*AA); bool Combined = false; unsigned Index = 0; for (auto &I : BB) { - if (I.mayWriteToMemory() || I.mayThrow()) { + if (I.mayThrow() || (I.mayWriteToMemory() && AST.containsUnknown(&I))) { if (combineLoads(LoadMap)) Combined = true; LoadMap.clear(); + AST.clear(); continue; } LoadInst *LI = dyn_cast<LoadInst>(&I); @@ -248,6 +257,7 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { if (!POP.Pointer) continue; LoadMap[POP.Pointer].push_back(LoadPOPPair(LI, POP, Index++)); + AST.add(LI); } if (combineLoads(LoadMap)) Combined = true; @@ -256,6 +266,9 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); + + AU.addRequired<AliasAnalysis>(); + AU.addPreserved<AliasAnalysis>(); } char LoadCombine::ID = 0; @@ -264,5 +277,9 @@ BasicBlockPass *llvm::createLoadCombinePass() { return new LoadCombine(); } -INITIALIZE_PASS(LoadCombine, "load-combine", "Combine Adjacent Loads", false, - false) +INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", "Combine Adjacent Loads", + false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(LoadCombine, "load-combine", "Combine Adjacent Loads", + false, false) + diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 5ab686aa831a..1d1f33ae6183 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -239,9 +239,8 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { LoopInfo &loopInfo = getAnalysis<LoopInfo>(); 
SmallPtrSet<BasicBlock*, 8> blocks; blocks.insert(L->block_begin(), L->block_end()); - for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(), - E = blocks.end(); I != E; ++I) - loopInfo.removeBlock(*I); + for (BasicBlock *BB : blocks) + loopInfo.removeBlock(BB); // The last step is to inform the loop pass manager that we've // eliminated this loop. diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index ab1a9393c526..1ac38e0f52a5 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -41,6 +42,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); @@ -54,6 +56,7 @@ namespace { char LoopInstSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", "Simplify instructions in loops", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfo) @@ -76,6 +79,8 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *L->getHeader()->getParent()); SmallVector<BasicBlock*, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); @@ -116,7 +121,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Don't bother simplifying unused instructions. if (!I->use_empty()) { - Value *V = SimplifyInstruction(I, DL, TLI, DT); + Value *V = SimplifyInstruction(I, DL, TLI, DT, &AC); if (V && LI->replacementPreservesLCSSAForm(I, V)) { // Mark all uses for resimplification next time round the loop. for (User *U : I->users()) @@ -148,7 +153,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) { BasicBlock *SuccBB = *SI; - if (!Visited.insert(SuccBB)) + if (!Visited.insert(SuccBB).second) continue; const Loop *SuccLoop = LI->getLoopFor(SuccBB); @@ -161,7 +166,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) { BasicBlock *ExitBB = SubLoopExitBlocks[i]; - if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB)) + if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second) VisitStack.push_back(WorklistItem(ExitBB, false)); } diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp index b6fbb16166dd..8f122041c248 100644 --- a/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -215,9 +215,7 @@ protected: typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector; // Add a new possible reduction. 
- void addSLR(SimpleLoopReduction &SLR) { - PossibleReds.push_back(SLR); - } + void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); } // Setup to track possible reductions corresponding to the provided // rerolling scale. Only reductions with a number of non-PHI instructions @@ -225,7 +223,8 @@ protected: // are filled in: // - A set of all possible instructions in eligible reductions. // - A set of all PHIs in eligible reductions - // - A set of all reduced values (last instructions) in eligible reductions. + // - A set of all reduced values (last instructions) in eligible + // reductions. void restrictToScale(uint64_t Scale, SmallInstructionSet &PossibleRedSet, SmallInstructionSet &PossibleRedPHISet, @@ -238,13 +237,12 @@ protected: if (PossibleReds[i].size() % Scale == 0) { PossibleRedLastSet.insert(PossibleReds[i].getReducedValue()); PossibleRedPHISet.insert(PossibleReds[i].getPHI()); - + PossibleRedSet.insert(PossibleReds[i].getPHI()); PossibleRedIdx[PossibleReds[i].getPHI()] = i; - for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), - JE = PossibleReds[i].end(); J != JE; ++J) { - PossibleRedSet.insert(*J); - PossibleRedIdx[*J] = i; + for (Instruction *J : PossibleReds[i]) { + PossibleRedSet.insert(J); + PossibleRedIdx[J] = i; } } } @@ -487,7 +485,7 @@ void LoopReroll::collectInLoopUserSet(Loop *L, if (PN->getIncomingBlock(U) == L->getHeader()) continue; } - + if (L->contains(User) && !Exclude.count(User)) { Queue.push_back(User); } @@ -659,16 +657,15 @@ bool LoopReroll::ReductionTracker::validateSelected() { RI != RIE; ++RI) { int i = *RI; int PrevIter = 0, BaseCount = 0, Count = 0; - for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), - JE = PossibleReds[i].end(); J != JE; ++J) { - // Note that all instructions in the chain must have been found because - // all instructions in the function must have been assigned to some - // iteration. - int Iter = PossibleRedIter[*J]; + for (Instruction *J : PossibleReds[i]) { + // Note that all instructions in the chain must have been found because + // all instructions in the function must have been assigned to some + // iteration. + int Iter = PossibleRedIter[J]; if (Iter != PrevIter && Iter != PrevIter + 1 && !PossibleReds[i].getReducedValue()->isAssociative()) { DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << - *J << "\n"); + J << "\n"); return false; } @@ -881,7 +878,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, // needed because otherwise isSafeToSpeculativelyExecute returns // false on PHI nodes. if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL)) - FutureSideEffects = true; + FutureSideEffects = true; } ++J2; @@ -952,9 +949,9 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { Value *Op2 = J2->getOperand(j); - // If this is part of a reduction (and the operation is not - // associatve), then we match all operands, but not those that are - // part of the reduction. + // If this is part of a reduction (and the operation is not + // associatve), then we match all operands, but not those that are + // part of the reduction. if (InReduction) if (Instruction *Op2I = dyn_cast<Instruction>(Op2)) if (Reductions.isPairInSame(J2, Op2I)) @@ -968,11 +965,11 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, Op2 = IV; if (J1->getOperand(Swapped ? 
unsigned(!j) : j) != Op2) { - // If we've not already decided to swap the matched operands, and - // we've not already matched our first operand (note that we could - // have skipped matching the first operand because it is part of a - // reduction above), and the instruction is commutative, then try - // the swapped match. + // If we've not already decided to swap the matched operands, and + // we've not already matched our first operand (note that we could + // have skipped matching the first operand because it is part of a + // reduction above), and the instruction is commutative, then try + // the swapped match. if (!Swapped && J1->isCommutative() && !SomeOpMatched && J1->getOperand(!j) == Op2) { Swapped = true; @@ -1069,7 +1066,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, continue; } - ++J; + ++J; } // Insert the new induction variable. @@ -1110,9 +1107,9 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), Preheader->getTerminator()); } - - Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, - "exitcond"); + + Value *Cond = + new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond"); BI->setCondition(Cond); if (BI->getSuccessor(1) != Header) @@ -1182,4 +1179,3 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { return Changed; } - diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 2ce58314f8ef..9164be224654 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" @@ -53,6 +54,7 @@ namespace { // LCSSA form makes instruction renaming easier. void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); @@ -72,12 +74,14 @@ namespace { unsigned MaxHeaderSize; LoopInfo *LI; const TargetTransformInfo *TTI; + AssumptionCache *AC; }; } char LoopRotate::ID = 0; INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -98,6 +102,8 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { LI = &getAnalysis<LoopInfo>(); TTI = &getAnalysis<TargetTransformInfo>(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *L->getHeader()->getParent()); // Simplify the loop latch before attempting to rotate the header // upward. Rotation may not be needed if the loop tail can be folded into the @@ -184,13 +190,18 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, } } -/// Determine whether the instructions in this range my be safely and cheaply +/// Determine whether the instructions in this range may be safely and cheaply /// speculated. This is not an important enough situation to develop complex /// heuristics. We handle a single arithmetic instruction along with any type /// conversions. 
static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, - BasicBlock::iterator End) { + BasicBlock::iterator End, Loop *L) { bool seenIncrement = false; + bool MultiExitLoop = false; + + if (!L->getExitingBlock()) + MultiExitLoop = true; + for (BasicBlock::iterator I = Begin; I != End; ++I) { if (!isSafeToSpeculativelyExecute(I)) @@ -214,11 +225,33 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, case Instruction::Xor: case Instruction::Shl: case Instruction::LShr: - case Instruction::AShr: + case Instruction::AShr: { + Value *IVOpnd = nullptr; + if (isa<ConstantInt>(I->getOperand(0))) + IVOpnd = I->getOperand(1); + + if (isa<ConstantInt>(I->getOperand(1))) { + if (IVOpnd) + return false; + + IVOpnd = I->getOperand(0); + } + + // If increment operand is used outside of the loop, this speculation + // could cause extra live range interference. + if (MultiExitLoop && IVOpnd) { + for (User *UseI : IVOpnd->users()) { + auto *UserInst = cast<Instruction>(UseI); + if (!L->contains(UserInst)) + return false; + } + } + if (seenIncrement) return false; seenIncrement = true; break; + } case Instruction::Trunc: case Instruction::ZExt: case Instruction::SExt: @@ -232,7 +265,7 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, /// Fold the loop tail into the loop exit by speculating the loop tail /// instructions. Typically, this is a single post-increment. In the case of a /// simple 2-block loop, hoisting the increment can be much better than -/// duplicating the entire loop header. In the cast of loops with early exits, +/// duplicating the entire loop header. In the case of loops with early exits, /// rotation will not work anyway, but simplifyLoopLatch will put the loop in /// canonical form so downstream passes can handle it. /// @@ -254,7 +287,7 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { if (!BI) return false; - if (!shouldSpeculateInstrs(Latch->begin(), Jmp)) + if (!shouldSpeculateInstrs(Latch->begin(), Jmp, L)) return false; DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " @@ -323,8 +356,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // Check size of original header and reject loop if it is very big or we can't // duplicate blocks inside it. { + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader, *TTI); + Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); if (Metrics.notDuplicatable) { DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" << " instructions: "; L->dump()); @@ -406,6 +442,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // With the operands remapped, see if the instruction constant folds or is // otherwise simplifyable. This commonly occurs because the entry from PHI // nodes allows icmps and other instructions to fold. + // FIXME: Provide DL, TLI, DT, AC to SimplifyInstruction. 
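// Sketch of the ephemeral-value-aware sizing pattern that now appears both in
// rotateLoop() above and in the unroller's ApproximateLoopSize() further down
// (the helper below is illustrative, not part of the patch): values that exist
// only to feed llvm.assume are collected from the assumption cache first, so
// they no longer count against the duplication and size thresholds.
static unsigned approximateLoopSize(const Loop *L, AssumptionCache *AC,
                                    const TargetTransformInfo &TTI) {
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, AC, EphValues);

  CodeMetrics Metrics;
  for (BasicBlock *BB : L->getBlocks())
    Metrics.analyzeBasicBlock(BB, TTI, EphValues);
  return Metrics.NumInsts;
}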
Value *V = SimplifyInstruction(C); if (V && LI->replacementPreservesLCSSAForm(C, V)) { // If so, then delete the temporary instruction and stick the folded value diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 914b56aa8167..7b60373dc508 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -744,7 +744,7 @@ static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { /// TODO: Allow UDivExpr if we can find an existing IV increment that is an /// obvious multiple of the UDivExpr. static bool isHighCostExpansion(const SCEV *S, - SmallPtrSet<const SCEV*, 8> &Processed, + SmallPtrSetImpl<const SCEV*> &Processed, ScalarEvolution &SE) { // Zero/One operand expressions switch (S->getSCEVType()) { @@ -762,7 +762,7 @@ static bool isHighCostExpansion(const SCEV *S, Processed, SE); } - if (!Processed.insert(S)) + if (!Processed.insert(S).second) return false; if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { @@ -892,34 +892,34 @@ public: void RateFormula(const TargetTransformInfo &TTI, const Formula &F, - SmallPtrSet<const SCEV *, 16> &Regs, + SmallPtrSetImpl<const SCEV *> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU, - SmallPtrSet<const SCEV *, 16> *LoserRegs = nullptr); + SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr); void print(raw_ostream &OS) const; void dump() const; private: void RateRegister(const SCEV *Reg, - SmallPtrSet<const SCEV *, 16> &Regs, + SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, ScalarEvolution &SE, DominatorTree &DT); void RatePrimaryRegister(const SCEV *Reg, - SmallPtrSet<const SCEV *, 16> &Regs, + SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, ScalarEvolution &SE, DominatorTree &DT, - SmallPtrSet<const SCEV *, 16> *LoserRegs); + SmallPtrSetImpl<const SCEV *> *LoserRegs); }; } /// RateRegister - Tally up interesting quantities from the given register. void Cost::RateRegister(const SCEV *Reg, - SmallPtrSet<const SCEV *, 16> &Regs, + SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, ScalarEvolution &SE, DominatorTree &DT) { if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { @@ -967,15 +967,15 @@ void Cost::RateRegister(const SCEV *Reg, /// before, rate it. Optional LoserRegs provides a way to declare any formula /// that refers to one of those regs an instant loser. 
void Cost::RatePrimaryRegister(const SCEV *Reg, - SmallPtrSet<const SCEV *, 16> &Regs, + SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, ScalarEvolution &SE, DominatorTree &DT, - SmallPtrSet<const SCEV *, 16> *LoserRegs) { + SmallPtrSetImpl<const SCEV *> *LoserRegs) { if (LoserRegs && LoserRegs->count(Reg)) { Lose(); return; } - if (Regs.insert(Reg)) { + if (Regs.insert(Reg).second) { RateRegister(Reg, Regs, L, SE, DT); if (LoserRegs && isLoser()) LoserRegs->insert(Reg); @@ -984,13 +984,13 @@ void Cost::RatePrimaryRegister(const SCEV *Reg, void Cost::RateFormula(const TargetTransformInfo &TTI, const Formula &F, - SmallPtrSet<const SCEV *, 16> &Regs, + SmallPtrSetImpl<const SCEV *> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU, - SmallPtrSet<const SCEV *, 16> *LoserRegs) { + SmallPtrSetImpl<const SCEV *> *LoserRegs) { assert(F.isCanonical() && "Cost is accurate only for canonical formula"); // Tally up the registers. if (const SCEV *ScaledReg = F.ScaledReg) { @@ -1337,10 +1337,9 @@ void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { } // Update the RegTracker. - for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(), - E = OldRegs.end(); I != E; ++I) - if (!Regs.count(*I)) - RegUses.DropRegister(*I, LUIdx); + for (const SCEV *S : OldRegs) + if (!Regs.count(S)) + RegUses.DropRegister(S, LUIdx); } void LSRUse::print(raw_ostream &OS) const { @@ -2226,13 +2225,12 @@ LSRInstance::OptimizeLoopTermCond() { // must dominate all the post-inc comparisons we just set up, and it must // dominate the loop latch edge. IVIncInsertPos = L->getLoopLatch()->getTerminator(); - for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(), - E = PostIncs.end(); I != E; ++I) { + for (Instruction *Inst : PostIncs) { BasicBlock *BB = DT.findNearestCommonDominator(IVIncInsertPos->getParent(), - (*I)->getParent()); - if (BB == (*I)->getParent()) - IVIncInsertPos = *I; + Inst->getParent()); + if (BB == Inst->getParent()) + IVIncInsertPos = Inst; else if (BB != IVIncInsertPos->getParent()) IVIncInsertPos = BB->getTerminator(); } @@ -2557,7 +2555,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr, /// /// TODO: Consider IVInc free if it's already used in another chains. 
static bool -isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, +isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI) { if (StressIVChain) return true; @@ -2567,9 +2565,8 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, if (!Users.empty()) { DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n"; - for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(), - E = Users.end(); I != E; ++I) { - dbgs() << " " << **I << "\n"; + for (Instruction *Inst : Users) { + dbgs() << " " << *Inst << "\n"; }); return false; } @@ -2805,7 +2802,7 @@ void LSRInstance::CollectChains() { User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE); while (IVOpIter != IVOpEnd) { Instruction *IVOpInst = cast<Instruction>(*IVOpIter); - if (UniqueOperands.insert(IVOpInst)) + if (UniqueOperands.insert(IVOpInst).second) ChainInstruction(I, IVOpInst, ChainUsersVec); IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); } @@ -3119,11 +3116,15 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { void LSRInstance::CollectLoopInvariantFixupsAndFormulae() { SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end()); - SmallPtrSet<const SCEV *, 8> Inserted; + SmallPtrSet<const SCEV *, 32> Visited; while (!Worklist.empty()) { const SCEV *S = Worklist.pop_back_val(); + // Don't process the same SCEV twice + if (!Visited.insert(S).second) + continue; + if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) Worklist.append(N->op_begin(), N->op_end()); else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) @@ -3132,7 +3133,6 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { Worklist.push_back(D->getLHS()); Worklist.push_back(D->getRHS()); } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) { - if (!Inserted.insert(US)) continue; const Value *V = US->getValue(); if (const Instruction *Inst = dyn_cast<Instruction>(V)) { // Look for instructions defined outside the loop. @@ -3774,7 +3774,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1; LUIdx = UsedByIndices.find_next(LUIdx)) // Make a memo of this use, offset, and register tuple. - if (UniqueItems.insert(std::make_pair(LUIdx, Imm))) + if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second) WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg)); } } @@ -4302,10 +4302,9 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, // reference that register in order to be considered. This prunes out // unprofitable searching. 
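// Aside on the mechanical ".insert(X).second" edits scattered through this
// file (and the SmallPtrSet -> SmallPtrSetImpl parameter changes): SmallPtrSet
// insert() now returns a std::pair<iterator, bool> in the std::set style, so
// code that relied on the old bool return keys off ".second". Minimal sketch
// with illustrative names:
static void enqueueOnce(const SCEV *S, SmallPtrSetImpl<const SCEV *> &Seen,
                        SmallVectorImpl<const SCEV *> &Worklist) {
  if (Seen.insert(S).second)   // true only if S was not already in the set
    Worklist.push_back(S);
}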
SmallSetVector<const SCEV *, 4> ReqRegs; - for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(), - E = CurRegs.end(); I != E; ++I) - if (LU.Regs.count(*I)) - ReqRegs.insert(*I); + for (const SCEV *S : CurRegs) + if (LU.Regs.count(S)) + ReqRegs.insert(S); SmallPtrSet<const SCEV *, 16> NewRegs; Cost NewCost; @@ -4350,9 +4349,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, } else { DEBUG(dbgs() << "New best at "; NewCost.print(dbgs()); dbgs() << ".\n Regs:"; - for (SmallPtrSet<const SCEV *, 16>::const_iterator - I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I) - dbgs() << ' ' << **I; + for (const SCEV *S : NewRegs) + dbgs() << ' ' << *S; dbgs() << '\n'); SolutionCost = NewCost; diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 935f289f040f..fef52107f623 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -13,7 +13,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/FunctionTargetTransformInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -52,7 +54,7 @@ UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden, static cl::opt<unsigned> PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden, - cl::desc("Unrolled size limit for loops with an unroll(enable) or " + cl::desc("Unrolled size limit for loops with an unroll(full) or " "unroll_count pragma.")); namespace { @@ -102,6 +104,7 @@ namespace { /// loop preheaders be inserted into the CFG... /// void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); @@ -111,6 +114,7 @@ namespace { AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); AU.addRequired<TargetTransformInfo>(); + AU.addRequired<FunctionTargetTransformInfo>(); // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next // loop will receive invalid dom info. @@ -120,7 +124,7 @@ namespace { // Fill in the UnrollingPreferences parameter with values from the // TargetTransformationInfo. - void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI, + void getUnrollingPreferences(Loop *L, const FunctionTargetTransformInfo &FTTI, TargetTransformInfo::UnrollingPreferences &UP) { UP.Threshold = CurrentThreshold; UP.OptSizeThreshold = OptSizeUnrollThreshold; @@ -130,7 +134,7 @@ namespace { UP.MaxCount = UINT_MAX; UP.Partial = CurrentAllowPartial; UP.Runtime = CurrentRuntime; - TTI.getUnrollingPreferences(L, UP); + FTTI.getUnrollingPreferences(L, UP); } // Select and return an unroll count based on parameters from @@ -138,12 +142,11 @@ namespace { // SetExplicitly is set to true if the unroll count is is set by // the user or a pragma rather than selected heuristically. 
unsigned - selectUnrollCount(const Loop *L, unsigned TripCount, bool HasEnablePragma, + selectUnrollCount(const Loop *L, unsigned TripCount, bool PragmaFullUnroll, unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP, bool &SetExplicitly); - // Select threshold values used to limit unrolling based on a // total unrolled size. Parameters Threshold and PartialThreshold // are set to the maximum unrolled size for fully and partially @@ -183,6 +186,8 @@ namespace { char LoopUnroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(FunctionTargetTransformInfo) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -201,11 +206,15 @@ Pass *llvm::createSimpleLoopUnrollPass() { /// ApproximateLoopSize - Approximate the size of the loop. static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, - const TargetTransformInfo &TTI) { + const TargetTransformInfo &TTI, + AssumptionCache *AC) { + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I, TTI); + Metrics.analyzeBasicBlock(*I, TTI, EphValues); NumCalls = Metrics.NumInlineCandidates; NotDuplicatable = Metrics.notDuplicatable; @@ -213,19 +222,22 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, // Don't allow an estimate of size zero. This would allows unrolling of loops // with huge iteration counts, which is a compile time problem even if it's - // not a problem for code quality. - if (LoopSize == 0) LoopSize = 1; + // not a problem for code quality. Also, the code using this size may assume + // that each loop has at least three instructions (likely a conditional + // branch, a comparison feeding that branch, and some kind of loop increment + // feeding that comparison instruction). + LoopSize = std::max(LoopSize, 3u); return LoopSize; } -// Returns the value associated with the given metadata node name (for -// example, "llvm.loop.unroll.count"). If no such named metadata node -// exists, then nullptr is returned. -static const ConstantInt *GetUnrollMetadataValue(const Loop *L, - StringRef Name) { +// Returns the loop hint metadata node with the given name (for example, +// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is +// returned. +static const MDNode *GetUnrollMetadata(const Loop *L, StringRef Name) { MDNode *LoopID = L->getLoopID(); - if (!LoopID) return nullptr; + if (!LoopID) + return nullptr; // First operand should refer to the loop id itself. 
assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); @@ -233,41 +245,38 @@ static const ConstantInt *GetUnrollMetadataValue(const Loop *L, for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); - if (!MD) continue; + if (!MD) + continue; const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); - if (!S) continue; + if (!S) + continue; - if (Name.equals(S->getString())) { - assert(MD->getNumOperands() == 2 && - "Unroll hint metadata should have two operands."); - return cast<ConstantInt>(MD->getOperand(1)); - } + if (Name.equals(S->getString())) + return MD; } return nullptr; } -// Returns true if the loop has an unroll(enable) pragma. -static bool HasUnrollEnablePragma(const Loop *L) { - const ConstantInt *EnableValue = - GetUnrollMetadataValue(L, "llvm.loop.unroll.enable"); - return (EnableValue && EnableValue->getZExtValue()); +// Returns true if the loop has an unroll(full) pragma. +static bool HasUnrollFullPragma(const Loop *L) { + return GetUnrollMetadata(L, "llvm.loop.unroll.full"); } // Returns true if the loop has an unroll(disable) pragma. static bool HasUnrollDisablePragma(const Loop *L) { - const ConstantInt *EnableValue = - GetUnrollMetadataValue(L, "llvm.loop.unroll.enable"); - return (EnableValue && !EnableValue->getZExtValue()); + return GetUnrollMetadata(L, "llvm.loop.unroll.disable"); } // If loop has an unroll_count pragma return the (necessarily // positive) value from the pragma. Otherwise return 0. static unsigned UnrollCountPragmaValue(const Loop *L) { - const ConstantInt *CountValue = - GetUnrollMetadataValue(L, "llvm.loop.unroll.count"); - if (CountValue) { - unsigned Count = CountValue->getZExtValue(); + const MDNode *MD = GetUnrollMetadata(L, "llvm.loop.unroll.count"); + if (MD) { + assert(MD->getNumOperands() == 2 && + "Unroll count hint metadata should have two operands."); + unsigned Count = + mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue(); assert(Count >= 1 && "Unroll count must be positive."); return Count; } @@ -283,9 +292,9 @@ static void SetLoopAlreadyUnrolled(Loop *L) { if (!LoopID) return; // First remove any existing loop unrolling metadata. - SmallVector<Value *, 4> Vals; + SmallVector<Metadata *, 4> MDs; // Reserve first location for self reference to the LoopID metadata node. - Vals.push_back(nullptr); + MDs.push_back(nullptr); for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { bool IsUnrollMetadata = false; MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); @@ -293,26 +302,25 @@ static void SetLoopAlreadyUnrolled(Loop *L) { const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); } - if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i)); + if (!IsUnrollMetadata) + MDs.push_back(LoopID->getOperand(i)); } // Add unroll(disable) metadata to disable future unrolling. 
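With the helpers above, the unroll hints become presence-based: full and disable are separate metadata names rather than an i1 operand on llvm.loop.unroll.enable, and only the count hint still carries a constant. Condensed from those functions, the three hints are read roughly as:

bool PragmaFullUnroll    = GetUnrollMetadata(L, "llvm.loop.unroll.full") != nullptr;
bool PragmaDisableUnroll = GetUnrollMetadata(L, "llvm.loop.unroll.disable") != nullptr;

unsigned PragmaCount = 0;
if (const MDNode *MD = GetUnrollMetadata(L, "llvm.loop.unroll.count"))
  // Operand 0 is the hint name, operand 1 the ConstantInt count.
  PragmaCount = mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();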
LLVMContext &Context = L->getHeader()->getContext(); - SmallVector<Value *, 2> DisableOperands; - DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.enable")); - DisableOperands.push_back(ConstantInt::get(Type::getInt1Ty(Context), 0)); + SmallVector<Metadata *, 1> DisableOperands; + DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); MDNode *DisableNode = MDNode::get(Context, DisableOperands); - Vals.push_back(DisableNode); + MDs.push_back(DisableNode); - MDNode *NewLoopID = MDNode::get(Context, Vals); + MDNode *NewLoopID = MDNode::get(Context, MDs); // Set operand 0 to refer to the loop id itself. NewLoopID->replaceOperandWith(0, NewLoopID); L->setLoopID(NewLoopID); - LoopID->replaceAllUsesWith(NewLoopID); } unsigned LoopUnroll::selectUnrollCount( - const Loop *L, unsigned TripCount, bool HasEnablePragma, + const Loop *L, unsigned TripCount, bool PragmaFullUnroll, unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP, bool &SetExplicitly) { SetExplicitly = true; @@ -326,9 +334,7 @@ unsigned LoopUnroll::selectUnrollCount( if (Count == 0) { if (PragmaCount) { Count = PragmaCount; - } else if (HasEnablePragma) { - // unroll(enable) pragma without an unroll_count pragma - // indicates to unroll loop fully. + } else if (PragmaFullUnroll) { Count = TripCount; } } @@ -360,6 +366,10 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { LoopInfo *LI = &getAnalysis<LoopInfo>(); ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); + const FunctionTargetTransformInfo &FTTI = + getAnalysis<FunctionTargetTransformInfo>(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *L->getHeader()->getParent()); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() @@ -368,37 +378,43 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (HasUnrollDisablePragma(L)) { return false; } - bool HasEnablePragma = HasUnrollEnablePragma(L); + bool PragmaFullUnroll = HasUnrollFullPragma(L); unsigned PragmaCount = UnrollCountPragmaValue(L); - bool HasPragma = HasEnablePragma || PragmaCount > 0; + bool HasPragma = PragmaFullUnroll || PragmaCount > 0; TargetTransformInfo::UnrollingPreferences UP; - getUnrollingPreferences(L, TTI, UP); + getUnrollingPreferences(L, FTTI, UP); // Find trip count and trip multiple if count is not available unsigned TripCount = 0; unsigned TripMultiple = 1; - // Find "latch trip count". UnrollLoop assumes that control cannot exit - // via the loop latch on any iteration prior to TripCount. The loop may exit - // early via an earlier branch. - BasicBlock *LatchBlock = L->getLoopLatch(); - if (LatchBlock) { - TripCount = SE->getSmallConstantTripCount(L, LatchBlock); - TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); + // If there are multiple exiting blocks but one of them is the latch, use the + // latch for the trip count estimation. Otherwise insist on a single exiting + // block for the trip count estimation. + BasicBlock *ExitingBlock = L->getLoopLatch(); + if (!ExitingBlock || !L->isLoopExiting(ExitingBlock)) + ExitingBlock = L->getExitingBlock(); + if (ExitingBlock) { + TripCount = SE->getSmallConstantTripCount(L, ExitingBlock); + TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); } // Select an initial unroll count. This may be reduced later based // on size thresholds. 
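The runOnLoop change above relaxes where the trip count may come from: the latch is used when it is also an exiting block, so multi-exit loops whose latch exits remain analyzable, and otherwise a single exiting block is required. In sketch form, with the names from the hunk:

BasicBlock *ExitingBlock = L->getLoopLatch();
if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
  ExitingBlock = L->getExitingBlock();   // null unless there is exactly one exit

unsigned TripCount = 0, TripMultiple = 1;
if (ExitingBlock) {
  TripCount    = SE->getSmallConstantTripCount(L, ExitingBlock);
  TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
}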
bool CountSetExplicitly; - unsigned Count = selectUnrollCount(L, TripCount, HasEnablePragma, PragmaCount, - UP, CountSetExplicitly); + unsigned Count = selectUnrollCount(L, TripCount, PragmaFullUnroll, + PragmaCount, UP, CountSetExplicitly); unsigned NumInlineCandidates; bool notDuplicatable; unsigned LoopSize = - ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI); + ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, &AC); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); - uint64_t UnrolledSize = (uint64_t)LoopSize * Count; + + // When computing the unrolled size, note that the conditional branch on the + // backedge and the comparison feeding it are not replicated like the rest of + // the loop body (which is why 2 is subtracted). + uint64_t UnrolledSize = (uint64_t)(LoopSize-2) * Count + 2; if (notDuplicatable) { DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" << " instructions.\n"); @@ -443,7 +459,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) { // Reduce unroll count to be modulo of TripCount for partial unrolling. - Count = PartialThreshold / LoopSize; + Count = (std::max(PartialThreshold, 3u)-2) / (LoopSize-2); while (Count != 0 && TripCount % Count != 0) Count--; } @@ -457,7 +473,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // the original count which satisfies the threshold limit. while (Count != 0 && UnrolledSize > PartialThreshold) { Count >>= 1; - UnrolledSize = LoopSize * Count; + UnrolledSize = (LoopSize-2) * Count + 2; } if (Count > UP.MaxCount) Count = UP.MaxCount; @@ -465,25 +481,26 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } if (HasPragma) { - // Mark loop as unrolled to prevent unrolling beyond that - // requested by the pragma. - SetLoopAlreadyUnrolled(L); + if (PragmaCount != 0) + // If loop has an unroll count pragma mark loop as unrolled to prevent + // unrolling beyond that requested by the pragma. + SetLoopAlreadyUnrolled(L); // Emit optimization remarks if we are unable to unroll the loop // as directed by a pragma. DebugLoc LoopLoc = L->getStartLoc(); Function *F = Header->getParent(); LLVMContext &Ctx = F->getContext(); - if (HasEnablePragma && PragmaCount == 0) { + if (PragmaFullUnroll && PragmaCount == 0) { if (TripCount && Count != TripCount) { emitOptimizationRemarkMissed( Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(enable) pragma " + "Unable to fully unroll loop as directed by unroll(full) pragma " "because unrolled size is too large."); } else if (!TripCount) { emitOptimizationRemarkMissed( Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(enable) pragma " + "Unable to fully unroll loop as directed by unroll(full) pragma " "because loop has a runtime trip count."); } } else if (PragmaCount > 0 && Count != OriginalCount) { @@ -501,7 +518,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Unroll the loop. 
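A small worked example of the size model introduced above, with made-up numbers: the compare-and-branch pair on the backedge is shared by all iterations, so it is charged once rather than Count times.

unsigned LoopSize = 10, Count = 4;
uint64_t UnrolledSize = (uint64_t)(LoopSize - 2) * Count + 2;             // 8*4 + 2 = 34

// Partial unrolling then picks the largest count that fits the threshold:
unsigned PartialThreshold = 50;
unsigned MaxFit = (std::max(PartialThreshold, 3u) - 2) / (LoopSize - 2);  // 48/8 = 6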
- if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this, &LPM)) + if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this, + &LPM, &AC)) return false; return true; diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 977c53a3bc63..9f4c12270d76 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -30,6 +30,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" @@ -103,7 +104,8 @@ namespace { // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. - bool countLoop(const Loop *L, const TargetTransformInfo &TTI); + bool countLoop(const Loop *L, const TargetTransformInfo &TTI, + AssumptionCache *AC); // Clean all data related to given loop. void forgetLoop(const Loop *L); @@ -126,6 +128,7 @@ namespace { class LoopUnswitch : public LoopPass { LoopInfo *LI; // Loop information LPPassManager *LPM; + AssumptionCache *AC; // LoopProcessWorklist - Used to check if second loop needs processing // after RewriteLoopBodyWithConditionConstant rewrites first loop. @@ -164,6 +167,7 @@ namespace { /// loop preheaders be inserted into the CFG. /// void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequired<LoopInfo>(); @@ -212,7 +216,8 @@ namespace { // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. -bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { +bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI, + AssumptionCache *AC) { LoopPropsMapIt PropsIt; bool Inserted; @@ -229,13 +234,16 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { // large numbers of branches which cause loop unswitching to go crazy. // This is a very ad-hoc heuristic. + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + // FIXME: This is overly conservative because it does not take into // consideration code simplification opportunities and code that can // be shared by the resultant unswitched loops. 
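A pattern repeated across the LoopUnroll, LoopUnswitch, MemCpyOpt and SROA hunks in this commit: the pass declares AssumptionCacheTracker as a required analysis, fetches the per-function cache in its run method, and threads it into whatever needs assumption information. Condensed sketch; the rest of the pass boilerplate is omitted and doWork stands in for the real body:

void getAnalysisUsage(AnalysisUsage &AU) const override {
  AU.addRequired<AssumptionCacheTracker>();
  // ...existing requirements and preservations stay as before...
}

bool runOnLoop(Loop *L, LPPassManager &LPM) override {
  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
          *L->getHeader()->getParent());
  // AC is then passed to CodeMetrics::collectEphemeralValues, UnrollLoop,
  // getOrEnforceKnownAlignment, and so on.
  return doWork(L, AC);   // placeholder for the pass body
}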
CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I, TTI); + Metrics.analyzeBasicBlock(*I, TTI, EphValues); Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5); Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation); @@ -326,6 +334,7 @@ char LoopUnswitch::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -376,6 +385,8 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { if (skipOptnoneFunction(L)) return false; + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *L->getHeader()->getParent()); LI = &getAnalysis<LoopInfo>(); LPM = &LPM_Ref; DominatorTreeWrapperPass *DTWP = @@ -421,7 +432,8 @@ bool LoopUnswitch::processCurrentLoop() { // Probably we reach the quota of branches for this loop. If so // stop unswitching. - if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>())) + if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>(), + AC)) return false; // Loop over all of the basic blocks in the loop. If we find an interior @@ -823,6 +835,10 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, F->getBasicBlockList().splice(NewPreheader, F->getBasicBlockList(), NewBlocks[0], F->end()); + // FIXME: We could register any cloned assumptions instead of clearing the + // whole function's cache. + AC->clear(); + // Now we create the new Loop object for the versioned loop. Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM); diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 7c184a4ad2c3..33b5f9df5a27 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" @@ -329,6 +330,7 @@ namespace { // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); @@ -361,6 +363,7 @@ FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) @@ -631,22 +634,24 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (destSize < srcSize) return false; } else if (Argument *A = dyn_cast<Argument>(cpyDest)) { - // If the destination is an sret parameter then only accesses that are - // outside of the returned struct type can trap. 
- if (!A->hasStructRetAttr()) - return false; + if (A->getDereferenceableBytes() < srcSize) { + // If the destination is an sret parameter then only accesses that are + // outside of the returned struct type can trap. + if (!A->hasStructRetAttr()) + return false; - Type *StructTy = cast<PointerType>(A->getType())->getElementType(); - if (!StructTy->isSized()) { - // The call may never return and hence the copy-instruction may never - // be executed, and therefore it's not safe to say "the destination - // has at least <cpyLen> bytes, as implied by the copy-instruction", - return false; - } + Type *StructTy = cast<PointerType>(A->getType())->getElementType(); + if (!StructTy->isSized()) { + // The call may never return and hence the copy-instruction may never + // be executed, and therefore it's not safe to say "the destination + // has at least <cpyLen> bytes, as implied by the copy-instruction", + return false; + } - uint64_t destSize = DL->getTypeAllocSize(StructTy); - if (destSize < srcSize) - return false; + uint64_t destSize = DL->getTypeAllocSize(StructTy); + if (destSize < srcSize) + return false; + } } else { return false; } @@ -673,15 +678,23 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) { for (User *UU : U->users()) srcUseList.push_back(UU); - } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) { - if (G->hasAllZeroIndices()) - for (User *UU : U->users()) - srcUseList.push_back(UU); - else + continue; + } + if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) { + if (!G->hasAllZeroIndices()) return false; - } else if (U != C && U != cpy) { - return false; + + for (User *UU : U->users()) + srcUseList.push_back(UU); + continue; } + if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U)) + if (IT->getIntrinsicID() == Intrinsic::lifetime_start || + IT->getIntrinsicID() == Intrinsic::lifetime_end) + continue; + + if (U != C && U != cpy) + return false; } // Check that src isn't captured by the called function since the @@ -969,8 +982,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. 
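Earlier in this hunk, performCallSlotOptzn's destination check is relaxed: an argument destination is now acceptable whenever its dereferenceable attribute covers the copied size, with the old sret-only reasoning kept as the fallback. The shape of the new check, condensed:

if (Argument *A = dyn_cast<Argument>(cpyDest)) {
  if (A->getDereferenceableBytes() < srcSize) {
    // No attribute guarantee that srcSize bytes can be accessed without
    // trapping; fall back to the old rule, which only accepts an sret
    // destination whose sized pointee is at least srcSize bytes.
    if (!A->hasStructRetAttr())
      return false;
    // ... sized-type and destSize >= srcSize checks as before ...
  }
  // Otherwise the dereferenceable(N) guarantee alone is enough.
}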
+ AssumptionCache &AC = + getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *CS->getParent()->getParent()); + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); if (MDep->getAlignment() < ByValAlign && - getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, DL) < ByValAlign) + getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &AC, + CS.getInstruction(), &DT) < ByValAlign) return false; // Verify that the copied-from memory doesn't change in between the memcpy and diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index c2467fecb5eb..8509713b3367 100644 --- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -97,9 +97,6 @@ using namespace llvm; //===----------------------------------------------------------------------===// // MergedLoadStoreMotion Pass //===----------------------------------------------------------------------===// -static cl::opt<bool> -EnableMLSM("mlsm", cl::desc("Enable motion of merged load and store"), - cl::init(true)); namespace { class MergedLoadStoreMotion : public FunctionPass { @@ -134,7 +131,9 @@ private: BasicBlock *getDiamondTail(BasicBlock *BB); bool isDiamondHead(BasicBlock *BB); // Routines for hoisting loads - bool isLoadHoistBarrier(Instruction *Inst); + bool isLoadHoistBarrierInRange(const Instruction& Start, + const Instruction& End, + LoadInst* LI); LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI); void hoistInstruction(BasicBlock *BB, Instruction *HoistCand, Instruction *ElseInst); @@ -144,7 +143,9 @@ private: // Routines for sinking stores StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI); PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); - bool isStoreSinkBarrier(Instruction *Inst); + bool isStoreSinkBarrierInRange(const Instruction& Start, + const Instruction& End, + AliasAnalysis::Location Loc); bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst); bool mergeStores(BasicBlock *BB); // The mergeLoad/Store algorithms could have Size0 * Size1 complexity, @@ -235,27 +236,12 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { /// being loaded or protect against the load from happening /// it is considered a hoist barrier. /// -bool MergedLoadStoreMotion::isLoadHoistBarrier(Instruction *Inst) { - // FIXME: A call with no side effects should not be a barrier. - // Aren't all such calls covered by mayHaveSideEffects() below? - // Then this check can be removed. - if (isa<CallInst>(Inst)) - return true; - if (isa<TerminatorInst>(Inst)) - return true; - // FIXME: Conservatively let a store instruction block the load. - // Use alias analysis instead. - if (isa<StoreInst>(Inst)) - return true; - // Note: mayHaveSideEffects covers all instructions that could - // trigger a change to state. Eg. in-flight stores have to be executed - // before ordered loads or fences, calls could invoke functions that store - // data to memory etc. 
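The MergedLoadStoreMotion changes around this point drop the per-instruction barrier predicates being removed above in favour of range queries against alias analysis (the new helpers appear just below). Hoisting a load asks whether anything in the range may modify the loaded location; sinking a store asks whether anything may reference the stored one. Condensed from those hunks, where LI, SI, Start, End and AA stand for the obvious surrounding values:

// Hoist barrier: any Mod of the load's location within [Start, End].
AliasAnalysis::Location LoadLoc = AA->getLocation(LI);
bool HoistBlocked =
    AA->canInstructionRangeModRef(Start, End, LoadLoc, AliasAnalysis::Mod);

// Sink barrier: any Ref of the store's location within [Start, End].
AliasAnalysis::Location StoreLoc = AA->getLocation(SI);
bool SinkBlocked =
    AA->canInstructionRangeModRef(Start, End, StoreLoc, AliasAnalysis::Ref);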
- if (Inst->mayHaveSideEffects()) { - return true; - } - DEBUG(dbgs() << "No Hoist Barrier\n"); - return false; + +bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start, + const Instruction& End, + LoadInst* LI) { + AliasAnalysis::Location Loc = AA->getLocation(LI); + return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod); } /// @@ -265,33 +251,29 @@ bool MergedLoadStoreMotion::isLoadHoistBarrier(Instruction *Inst) { /// and it can be hoisted from \p BB, return that load. /// Otherwise return Null. /// -LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB, - LoadInst *LI) { - LoadInst *I = nullptr; - assert(isa<LoadInst>(LI)); - if (LI->isUsedOutsideOfBlock(LI->getParent())) - return nullptr; +LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1, + LoadInst *Load0) { - for (BasicBlock::iterator BBI = BB->begin(), BBE = BB->end(); BBI != BBE; + for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE; ++BBI) { Instruction *Inst = BBI; // Only merge and hoist loads when their result in used only in BB - if (isLoadHoistBarrier(Inst)) - break; - if (!isa<LoadInst>(Inst)) - continue; - if (Inst->isUsedOutsideOfBlock(Inst->getParent())) + if (!isa<LoadInst>(Inst) || Inst->isUsedOutsideOfBlock(BB1)) continue; - AliasAnalysis::Location LocLI = AA->getLocation(LI); - AliasAnalysis::Location LocInst = AA->getLocation((LoadInst *)Inst); - if (AA->isMustAlias(LocLI, LocInst) && LI->getType() == Inst->getType()) { - I = (LoadInst *)Inst; - break; + LoadInst *Load1 = dyn_cast<LoadInst>(Inst); + BasicBlock *BB0 = Load0->getParent(); + + AliasAnalysis::Location Loc0 = AA->getLocation(Load0); + AliasAnalysis::Location Loc1 = AA->getLocation(Load1); + if (AA->isMustAlias(Loc0, Loc1) && Load0->isSameOperationAs(Load1) && + !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1) && + !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0)) { + return Load1; } } - return I; + return nullptr; } /// @@ -388,15 +370,10 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { Instruction *I = BBI; ++BBI; - if (isLoadHoistBarrier(I)) - break; // Only move non-simple (atomic, volatile) loads. - if (!isa<LoadInst>(I)) - continue; - - LoadInst *L0 = (LoadInst *)I; - if (!L0->isSimple()) + LoadInst *L0 = dyn_cast<LoadInst>(I); + if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0)) continue; ++NLoads; @@ -414,26 +391,19 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { } /// -/// \brief True when instruction is sink barrier for a store -/// -bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) { - // FIXME: Conservatively let a load instruction block the store. - // Use alias analysis instead. - if (isa<LoadInst>(Inst)) - return true; - if (isa<CallInst>(Inst)) - return true; - if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst)) - return true; - // Note: mayHaveSideEffects covers all instructions that could - // trigger a change to state. Eg. in-flight stores have to be executed - // before ordered loads or fences, calls could invoke functions that store - // data to memory etc. - if (!isa<StoreInst>(Inst) && Inst->mayHaveSideEffects()) { - return true; - } - DEBUG(dbgs() << "No Sink Barrier\n"); - return false; +/// \brief True when instruction is a sink barrier for a store +/// located in Loc +/// +/// Whenever an instruction could possibly read or modify the +/// value being stored or protect against the store from +/// happening it is considered a sink barrier. 
+/// + +bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction& Start, + const Instruction& End, + AliasAnalysis::Location + Loc) { + return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Ref); } /// @@ -441,27 +411,28 @@ bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) { /// /// \return The store in \p when it is safe to sink. Otherwise return Null. /// -StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB, - StoreInst *SI) { - StoreInst *I = 0; - DEBUG(dbgs() << "can Sink? : "; SI->dump(); dbgs() << "\n"); - for (BasicBlock::reverse_iterator RBI = BB->rbegin(), RBE = BB->rend(); +StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, + StoreInst *Store0) { + DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n"); + for (BasicBlock::reverse_iterator RBI = BB1->rbegin(), RBE = BB1->rend(); RBI != RBE; ++RBI) { Instruction *Inst = &*RBI; - // Only move loads if they are used in the block. - if (isStoreSinkBarrier(Inst)) - break; - if (isa<StoreInst>(Inst)) { - AliasAnalysis::Location LocSI = AA->getLocation(SI); - AliasAnalysis::Location LocInst = AA->getLocation((StoreInst *)Inst); - if (AA->isMustAlias(LocSI, LocInst)) { - I = (StoreInst *)Inst; - break; - } + if (!isa<StoreInst>(Inst)) + continue; + + StoreInst *Store1 = cast<StoreInst>(Inst); + BasicBlock *BB0 = Store0->getParent(); + + AliasAnalysis::Location Loc0 = AA->getLocation(Store0); + AliasAnalysis::Location Loc1 = AA->getLocation(Store1); + if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) && + !isStoreSinkBarrierInRange(*Store1, BB1->back(), Loc1) && + !isStoreSinkBarrierInRange(*Store0, BB0->back(), Loc0)) { + return Store1; } } - return I; + return nullptr; } /// @@ -573,8 +544,7 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { Instruction *I = &*RBI; ++RBI; - if (isStoreSinkBarrier(I)) - break; + // Sink move non-simple (atomic, volatile) stores if (!isa<StoreInst>(I)) continue; @@ -611,8 +581,6 @@ bool MergedLoadStoreMotion::runOnFunction(Function &F) { AA = &getAnalysis<AliasAnalysis>(); bool Changed = false; - if (!EnableMLSM) - return false; DEBUG(dbgs() << "Instruction Merger\n"); // Merge unconditional branches, allowing PRE to catch more @@ -622,7 +590,6 @@ bool MergedLoadStoreMotion::runOnFunction(Function &F) { // Hoist equivalent loads and sink stores // outside diamonds when possible - // Run outside core GVN if (isDiamondHead(BB)) { Changed |= mergeLoads(BB); Changed |= mergeStores(getDiamondTail(BB)); diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 7cce89e0627e..5c8bed585b64 100644 --- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -108,6 +108,10 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call, if (Call->onlyReadsMemory()) return false; + // The call must have the expected result type. 
+ if (!Call->getType()->isFloatingPointTy()) + return false; + // Do the following transformation: // // (before) diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index ea2cf7cf9b5f..4e022556f9cc 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -176,6 +176,7 @@ namespace { private: void BuildRankMap(Function &F); unsigned getRank(Value *V); + void canonicalizeOperands(Instruction *I); void ReassociateExpression(BinaryOperator *I); void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *OptimizeExpression(BinaryOperator *I, @@ -194,6 +195,7 @@ namespace { Value *RemoveFactorFromExpression(Value *V, Value *Factor); void EraseInst(Instruction *I); void OptimizeInst(Instruction *I); + Instruction *canonicalizeNegConstExpr(Instruction *I); }; } @@ -235,7 +237,20 @@ FunctionPass *llvm::createReassociatePass() { return new Reassociate(); } /// opcode and if it only has one use. static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { if (V->hasOneUse() && isa<Instruction>(V) && - cast<Instruction>(V)->getOpcode() == Opcode) + cast<Instruction>(V)->getOpcode() == Opcode && + (!isa<FPMathOperator>(V) || + cast<Instruction>(V)->hasUnsafeAlgebra())) + return cast<BinaryOperator>(V); + return nullptr; +} + +static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1, + unsigned Opcode2) { + if (V->hasOneUse() && isa<Instruction>(V) && + (cast<Instruction>(V)->getOpcode() == Opcode1 || + cast<Instruction>(V)->getOpcode() == Opcode2) && + (!isa<FPMathOperator>(V) || + cast<Instruction>(V)->hasUnsafeAlgebra())) return cast<BinaryOperator>(V); return nullptr; } @@ -264,9 +279,11 @@ static bool isUnmovableInstruction(Instruction *I) { void Reassociate::BuildRankMap(Function &F) { unsigned i = 2; - // Assign distinct ranks to function arguments - for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) + // Assign distinct ranks to function arguments. + for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) { ValueRankMap[&*I] = ++i; + DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n"); + } ReversePostOrderTraversal<Function*> RPOT(&F); for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(), @@ -304,24 +321,78 @@ unsigned Reassociate::getRank(Value *V) { // If this is a not or neg instruction, do not count it for rank. This // assures us that X and ~X will have the same rank. - if (!I->getType()->isIntegerTy() || - (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I))) + Type *Ty = V->getType(); + if ((!Ty->isIntegerTy() && !Ty->isFloatingPointTy()) || + (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) && + !BinaryOperator::isFNeg(I))) ++Rank; - //DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " - // << Rank << "\n"); + DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n"); return ValueRankMap[I] = Rank; } +// Canonicalize constants to RHS. Otherwise, sort the operands by rank. 
+void Reassociate::canonicalizeOperands(Instruction *I) { + assert(isa<BinaryOperator>(I) && "Expected binary operator."); + assert(I->isCommutative() && "Expected commutative operator."); + + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + unsigned LHSRank = getRank(LHS); + unsigned RHSRank = getRank(RHS); + + if (isa<Constant>(RHS)) + return; + + if (isa<Constant>(LHS) || RHSRank < LHSRank) + cast<BinaryOperator>(I)->swapOperands(); +} + +static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name, + Instruction *InsertBefore, Value *FlagsOp) { + if (S1->getType()->isIntegerTy()) + return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore); + else { + BinaryOperator *Res = + BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore); + Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags()); + return Res; + } +} + +static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name, + Instruction *InsertBefore, Value *FlagsOp) { + if (S1->getType()->isIntegerTy()) + return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore); + else { + BinaryOperator *Res = + BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore); + Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags()); + return Res; + } +} + +static BinaryOperator *CreateNeg(Value *S1, const Twine &Name, + Instruction *InsertBefore, Value *FlagsOp) { + if (S1->getType()->isIntegerTy()) + return BinaryOperator::CreateNeg(S1, Name, InsertBefore); + else { + BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore); + Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags()); + return Res; + } +} + /// LowerNegateToMultiply - Replace 0-X with X*-1. /// static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { - Constant *Cst = Constant::getAllOnesValue(Neg->getType()); + Type *Ty = Neg->getType(); + Constant *NegOne = Ty->isIntegerTy() ? ConstantInt::getAllOnesValue(Ty) + : ConstantFP::get(Ty, -1.0); - BinaryOperator *Res = - BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); - Neg->setOperand(1, Constant::getNullValue(Neg->getType())); // Drop use of op. + BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg); + Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op. Res->takeName(Neg); Neg->replaceAllUsesWith(Res); Res->setDebugLoc(Neg->getDebugLoc()); @@ -377,13 +448,14 @@ static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { LHS = 0; // 1 + 1 === 0 modulo 2. return; } - if (Opcode == Instruction::Add) { + if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) { // TODO: Reduce the weight by exploiting nsw/nuw? LHS += RHS; return; } - assert(Opcode == Instruction::Mul && "Unknown associative operation!"); + assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) && + "Unknown associative operation!"); unsigned Bitwidth = LHS.getBitWidth(); // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth // can be replaced with W-CM. 
That's because x^W=x^(W-CM) for every Bitwidth @@ -499,8 +571,7 @@ static bool LinearizeExprTree(BinaryOperator *I, DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits(); unsigned Opcode = I->getOpcode(); - assert(Instruction::isAssociative(Opcode) && - Instruction::isCommutative(Opcode) && + assert(I->isAssociative() && I->isCommutative() && "Expected an associative and commutative operation!"); // Visit all operands of the expression, keeping track of their weight (the @@ -515,7 +586,7 @@ static bool LinearizeExprTree(BinaryOperator *I, // ways to get to it. SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight) Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1))); - bool MadeChange = false; + bool Changed = false; // Leaves of the expression are values that either aren't the right kind of // operation (eg: a constant, or a multiply in an add tree), or are, but have @@ -552,7 +623,7 @@ static bool LinearizeExprTree(BinaryOperator *I, // If this is a binary operation of the right kind with only one use then // add its operands to the expression. if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { - assert(Visited.insert(Op) && "Not first visit!"); + assert(Visited.insert(Op).second && "Not first visit!"); DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n"); Worklist.push_back(std::make_pair(BO, Weight)); continue; @@ -562,7 +633,7 @@ static bool LinearizeExprTree(BinaryOperator *I, LeafMap::iterator It = Leaves.find(Op); if (It == Leaves.end()) { // Not in the leaf map. Must be the first time we saw this operand. - assert(Visited.insert(Op) && "Not first visit!"); + assert(Visited.insert(Op).second && "Not first visit!"); if (!Op->hasOneUse()) { // This value has uses not accounted for by the expression, so it is // not safe to modify. Mark it as being a leaf. @@ -584,7 +655,7 @@ static bool LinearizeExprTree(BinaryOperator *I, // exactly one such use, drop this new use of the leaf. assert(!Op->hasOneUse() && "Only one use, but we got here twice!"); I->setOperand(OpIdx, UndefValue::get(I->getType())); - MadeChange = true; + Changed = true; // If the leaf is a binary operation of the right kind and we now see // that its multiple original uses were in fact all by nodes belonging @@ -613,21 +684,24 @@ static bool LinearizeExprTree(BinaryOperator *I, // expression. This means that it can safely be modified. See if we // can usefully morph it into an expression of the right kind. assert((!isa<Instruction>(Op) || - cast<Instruction>(Op)->getOpcode() != Opcode) && + cast<Instruction>(Op)->getOpcode() != Opcode + || (isa<FPMathOperator>(Op) && + !cast<Instruction>(Op)->hasUnsafeAlgebra())) && "Should have been handled above!"); assert(Op->hasOneUse() && "Has uses outside the expression tree!"); // If this is a multiply expression, turn any internal negations into // multiplies by -1 so they can be reassociated. 
- BinaryOperator *BO = dyn_cast<BinaryOperator>(Op); - if (Opcode == Instruction::Mul && BO && BinaryOperator::isNeg(BO)) { - DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO "); - BO = LowerNegateToMultiply(BO); - DEBUG(dbgs() << *BO << 'n'); - Worklist.push_back(std::make_pair(BO, Weight)); - MadeChange = true; - continue; - } + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) + if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) || + (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) { + DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO "); + BO = LowerNegateToMultiply(BO); + DEBUG(dbgs() << *BO << '\n'); + Worklist.push_back(std::make_pair(BO, Weight)); + Changed = true; + continue; + } // Failed to morph into an expression of the right type. This really is // a leaf. @@ -665,7 +739,7 @@ static bool LinearizeExprTree(BinaryOperator *I, Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1))); } - return MadeChange; + return Changed; } // RewriteExprTree - Now that the operands for this expression tree are @@ -798,6 +872,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, Constant *Undef = UndefValue::get(I->getType()); NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode), Undef, Undef, "", I); + if (NewOp->getType()->isFloatingPointTy()) + NewOp->setFastMathFlags(I->getFastMathFlags()); } else { NewOp = NodesToRewrite.pop_back_val(); } @@ -817,7 +893,14 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, // expression tree is dominated by all of Ops. if (ExpressionChanged) do { - ExpressionChanged->clearSubclassOptionalData(); + // Preserve FastMathFlags. + if (isa<FPMathOperator>(I)) { + FastMathFlags Flags = I->getFastMathFlags(); + ExpressionChanged->clearSubclassOptionalData(); + ExpressionChanged->setFastMathFlags(Flags); + } else + ExpressionChanged->clearSubclassOptionalData(); + if (ExpressionChanged == I) break; ExpressionChanged->moveBefore(I); @@ -834,6 +917,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, /// version of the value is returned, and BI is left pointing at the instruction /// that should be processed next by the reassociation pass. static Value *NegateValue(Value *V, Instruction *BI) { + if (ConstantFP *C = dyn_cast<ConstantFP>(V)) + return ConstantExpr::getFNeg(C); if (Constant *C = dyn_cast<Constant>(V)) return ConstantExpr::getNeg(C); @@ -846,7 +931,8 @@ static Value *NegateValue(Value *V, Instruction *BI) { // the constants. We assume that instcombine will clean up the mess later if // we introduce tons of unnecessary negation instructions. // - if (BinaryOperator *I = isReassociableOp(V, Instruction::Add)) { + if (BinaryOperator *I = + isReassociableOp(V, Instruction::Add, Instruction::FAdd)) { // Push the negates through the add. I->setOperand(0, NegateValue(I->getOperand(0), BI)); I->setOperand(1, NegateValue(I->getOperand(1), BI)); @@ -864,7 +950,8 @@ static Value *NegateValue(Value *V, Instruction *BI) { // Okay, we need to materialize a negated version of V with an instruction. // Scan the use lists of V to see if we have one already. for (User *U : V->users()) { - if (!BinaryOperator::isNeg(U)) continue; + if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U)) + continue; // We found one! Now we have to make sure that the definition dominates // this use. 
We do this by moving it to the entry block (if it is a @@ -894,27 +981,34 @@ static Value *NegateValue(Value *V, Instruction *BI) { // Insert a 'neg' instruction that subtracts the value from zero to get the // negation. - return BinaryOperator::CreateNeg(V, V->getName() + ".neg", BI); + return CreateNeg(V, V->getName() + ".neg", BI, BI); } /// ShouldBreakUpSubtract - Return true if we should break up this subtract of /// X-Y into (X + -Y). static bool ShouldBreakUpSubtract(Instruction *Sub) { // If this is a negation, we can't split it up! - if (BinaryOperator::isNeg(Sub)) + if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub)) + return false; + + // Don't breakup X - undef. + if (isa<UndefValue>(Sub->getOperand(1))) return false; // Don't bother to break this up unless either the LHS is an associable add or // subtract or if this is only used by one. - if (isReassociableOp(Sub->getOperand(0), Instruction::Add) || - isReassociableOp(Sub->getOperand(0), Instruction::Sub)) + Value *V0 = Sub->getOperand(0); + if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) || + isReassociableOp(V0, Instruction::Sub, Instruction::FSub)) return true; - if (isReassociableOp(Sub->getOperand(1), Instruction::Add) || - isReassociableOp(Sub->getOperand(1), Instruction::Sub)) + Value *V1 = Sub->getOperand(1); + if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) || + isReassociableOp(V1, Instruction::Sub, Instruction::FSub)) return true; + Value *VB = Sub->user_back(); if (Sub->hasOneUse() && - (isReassociableOp(Sub->user_back(), Instruction::Add) || - isReassociableOp(Sub->user_back(), Instruction::Sub))) + (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) || + isReassociableOp(VB, Instruction::Sub, Instruction::FSub))) return true; return false; @@ -931,8 +1025,7 @@ static BinaryOperator *BreakUpSubtract(Instruction *Sub) { // and set it as the RHS of the add instruction we just made. // Value *NegVal = NegateValue(Sub->getOperand(1), Sub); - BinaryOperator *New = - BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub); + BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub); Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op. New->takeName(Sub); @@ -956,8 +1049,19 @@ static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op. Mul->takeName(Shl); + + // Everyone now refers to the mul instruction. Shl->replaceAllUsesWith(Mul); Mul->setDebugLoc(Shl->getDebugLoc()); + + // We can safely preserve the nuw flag in all cases. It's also safe to turn a + // nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special + // handling. 
+ bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap(); + bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap(); + if (NSW && NUW) + Mul->setHasNoSignedWrap(true); + Mul->setHasNoUnsignedWrap(NUW); return Mul; } @@ -969,13 +1073,23 @@ static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, Value *X) { unsigned XRank = Ops[i].Rank; unsigned e = Ops.size(); - for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) + for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) { if (Ops[j].Op == X) return j; + if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op)) + if (Instruction *I2 = dyn_cast<Instruction>(X)) + if (I1->isIdenticalTo(I2)) + return j; + } // Scan backwards. - for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) + for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) { if (Ops[j].Op == X) return j; + if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op)) + if (Instruction *I2 = dyn_cast<Instruction>(X)) + if (I1->isIdenticalTo(I2)) + return j; + } return i; } @@ -988,15 +1102,16 @@ static Value *EmitAddTreeOfValues(Instruction *I, Value *V1 = Ops.back(); Ops.pop_back(); Value *V2 = EmitAddTreeOfValues(I, Ops); - return BinaryOperator::CreateAdd(V2, V1, "tmp", I); + return CreateAdd(V2, V1, "tmp", I, I); } /// RemoveFactorFromExpression - If V is an expression tree that is a /// multiplication sequence, and if this sequence contains a multiply by Factor, /// remove Factor from the tree and return the new tree. Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { - BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); - if (!BO) return nullptr; + BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul); + if (!BO) + return nullptr; SmallVector<RepeatedValue, 8> Tree; MadeChange |= LinearizeExprTree(BO, Tree); @@ -1018,13 +1133,25 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { } // If this is a negative version of this factor, remove it. 
- if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) + if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) { if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op)) if (FC1->getValue() == -FC2->getValue()) { FoundFactor = NeedsNegate = true; Factors.erase(Factors.begin()+i); break; } + } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) { + if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) { + APFloat F1(FC1->getValueAPF()); + APFloat F2(FC2->getValueAPF()); + F2.changeSign(); + if (F1.compare(F2) == APFloat::cmpEqual) { + FoundFactor = NeedsNegate = true; + Factors.erase(Factors.begin() + i); + break; + } + } + } } if (!FoundFactor) { @@ -1046,7 +1173,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { } if (NeedsNegate) - V = BinaryOperator::CreateNeg(V, "neg", InsertPt); + V = CreateNeg(V, "neg", InsertPt, BO); return V; } @@ -1058,7 +1185,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { static void FindSingleUseMultiplyFactors(Value *V, SmallVectorImpl<Value*> &Factors, const SmallVectorImpl<ValueEntry> &Ops) { - BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); + BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul); if (!BO) { Factors.push_back(V); return; @@ -1385,17 +1512,19 @@ Value *Reassociate::OptimizeAdd(Instruction *I, ++NumFound; } while (i != Ops.size() && Ops[i].Op == TheOp); - DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); + DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); ++NumFactor; // Insert a new multiply. - Value *Mul = ConstantInt::get(cast<IntegerType>(I->getType()), NumFound); - Mul = BinaryOperator::CreateMul(TheOp, Mul, "factor", I); + Type *Ty = TheOp->getType(); + Constant *C = Ty->isIntegerTy() ? ConstantInt::get(Ty, NumFound) + : ConstantFP::get(Ty, NumFound); + Instruction *Mul = CreateMul(TheOp, C, "factor", I, I); // Now that we have inserted a multiply, optimize it. This allows us to // handle cases that require multiple factoring steps, such as this: // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6 - RedoInsts.insert(cast<Instruction>(Mul)); + RedoInsts.insert(Mul); // If every add operand was a duplicate, return the multiply. if (Ops.empty()) @@ -1412,11 +1541,12 @@ Value *Reassociate::OptimizeAdd(Instruction *I, } // Check for X and -X or X and ~X in the operand list. - if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isNot(TheOp)) + if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) && + !BinaryOperator::isNot(TheOp)) continue; Value *X = nullptr; - if (BinaryOperator::isNeg(TheOp)) + if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)) X = BinaryOperator::getNegArgument(TheOp); else if (BinaryOperator::isNot(TheOp)) X = BinaryOperator::getNotArgument(TheOp); @@ -1426,7 +1556,8 @@ Value *Reassociate::OptimizeAdd(Instruction *I, continue; // Remove X and -X from the operand list. - if (Ops.size() == 2 && BinaryOperator::isNeg(TheOp)) + if (Ops.size() == 2 && + (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))) return Constant::getNullValue(X->getType()); // Remove X and ~X from the operand list. 
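The Reassociate hunks above extend duplicate-operand factoring and X/-X cancellation to floating point when the instructions carry unsafe-algebra (fast-math) flags. A source-level intuition for the new behaviour; the functions are made up, and whether the rewrite actually fires depends on the flags the front end attaches:

float sum3(float x) {
  // Integer reassociation has long turned  x + x + x  into  x * 3; with
  // fast-math the FP path can now emit an fmul by 3.0 that inherits the
  // fast-math flags of the original fadds.
  return x + x + x;
}

float cancel(float x, float y) {
  // X and -X in one addition chain now also cancel for FP operands.
  return x + y + (-x);   // candidate to fold to y under fast-math
}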
@@ -1463,7 +1594,8 @@ Value *Reassociate::OptimizeAdd(Instruction *I, unsigned MaxOcc = 0; Value *MaxOccVal = nullptr; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul); + BinaryOperator *BOp = + isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul); if (!BOp) continue; @@ -1476,40 +1608,65 @@ Value *Reassociate::OptimizeAdd(Instruction *I, SmallPtrSet<Value*, 8> Duplicates; for (unsigned i = 0, e = Factors.size(); i != e; ++i) { Value *Factor = Factors[i]; - if (!Duplicates.insert(Factor)) continue; + if (!Duplicates.insert(Factor).second) + continue; unsigned Occ = ++FactorOccurrences[Factor]; - if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } + if (Occ > MaxOcc) { + MaxOcc = Occ; + MaxOccVal = Factor; + } // If Factor is a negative constant, add the negated value as a factor // because we can percolate the negate out. Watch for minint, which // cannot be positivified. - if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) + if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) { if (CI->isNegative() && !CI->isMinValue(true)) { Factor = ConstantInt::get(CI->getContext(), -CI->getValue()); assert(!Duplicates.count(Factor) && "Shouldn't have two constant factors, missed a canonicalize"); - unsigned Occ = ++FactorOccurrences[Factor]; - if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } + if (Occ > MaxOcc) { + MaxOcc = Occ; + MaxOccVal = Factor; + } + } + } else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) { + if (CF->isNegative()) { + APFloat F(CF->getValueAPF()); + F.changeSign(); + Factor = ConstantFP::get(CF->getContext(), F); + assert(!Duplicates.count(Factor) && + "Shouldn't have two constant factors, missed a canonicalize"); + unsigned Occ = ++FactorOccurrences[Factor]; + if (Occ > MaxOcc) { + MaxOcc = Occ; + MaxOccVal = Factor; + } } + } } } // If any factor occurred more than one time, we can pull it out. if (MaxOcc > 1) { - DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); + DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); ++NumFactor; // Create a new instruction that uses the MaxOccVal twice. If we don't do // this, we could otherwise run into situations where removing a factor // from an expression will drop a use of maxocc, and this can cause // RemoveFactorFromExpression on successive values to behave differently. - Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal); + Instruction *DummyInst = + I->getType()->isIntegerTy() + ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal) + : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal); + SmallVector<WeakVH, 4> NewMulOps; for (unsigned i = 0; i != Ops.size(); ++i) { // Only try to remove factors from expressions we're allowed to. - BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul); + BinaryOperator *BOp = + isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul); if (!BOp) continue; @@ -1542,7 +1699,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, RedoInsts.insert(VI); // Create the multiply. - Instruction *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I); + Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I); // Rerun associate on the multiply in case the inner expression turned into // a multiply. We want to make sure that we keep things in canonical form. 
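Earlier in this hunk, the common-factor search learns to treat a negative FP constant like its positive counterpart, mirroring the existing integer logic, because the sign can be percolated out of the factored product. The FP half, condensed:

if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
  if (CF->isNegative()) {
    APFloat F(CF->getValueAPF());
    F.changeSign();                                 // -C becomes +C
    Factor = ConstantFP::get(CF->getContext(), F);  // count the positive form
    unsigned Occ = ++FactorOccurrences[Factor];
    if (Occ > MaxOcc) {
      MaxOcc = Occ;
      MaxOccVal = Factor;
    }
  }
}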
@@ -1632,7 +1789,10 @@ static Value *buildMultiplyTree(IRBuilder<> &Builder, Value *LHS = Ops.pop_back_val(); do { - LHS = Builder.CreateMul(LHS, Ops.pop_back_val()); + if (LHS->getType()->isIntegerTy()) + LHS = Builder.CreateMul(LHS, Ops.pop_back_val()); + else + LHS = Builder.CreateFMul(LHS, Ops.pop_back_val()); } while (!Ops.empty()); return LHS; @@ -1765,11 +1925,13 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, break; case Instruction::Add: + case Instruction::FAdd: if (Value *Result = OptimizeAdd(I, Ops)) return Result; break; case Instruction::Mul: + case Instruction::FMul: if (Value *Result = OptimizeMul(I, Ops)) return Result; break; @@ -1797,12 +1959,104 @@ void Reassociate::EraseInst(Instruction *I) { // and add that since that's where optimization actually happens. unsigned Opcode = Op->getOpcode(); while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode && - Visited.insert(Op)) + Visited.insert(Op).second) Op = Op->user_back(); RedoInsts.insert(Op); } } +// Canonicalize expressions of the following form: +// x + (-Constant * y) -> x - (Constant * y) +// x - (-Constant * y) -> x + (Constant * y) +Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) { + if (!I->hasOneUse() || I->getType()->isVectorTy()) + return nullptr; + + // Must be a mul, fmul, or fdiv instruction. + unsigned Opcode = I->getOpcode(); + if (Opcode != Instruction::Mul && Opcode != Instruction::FMul && + Opcode != Instruction::FDiv) + return nullptr; + + // Must have at least one constant operand. + Constant *C0 = dyn_cast<Constant>(I->getOperand(0)); + Constant *C1 = dyn_cast<Constant>(I->getOperand(1)); + if (!C0 && !C1) + return nullptr; + + // Must be a negative ConstantInt or ConstantFP. + Constant *C = C0 ? C0 : C1; + unsigned ConstIdx = C0 ? 0 : 1; + if (auto *CI = dyn_cast<ConstantInt>(C)) { + if (!CI->isNegative()) + return nullptr; + } else if (auto *CF = dyn_cast<ConstantFP>(C)) { + if (!CF->isNegative()) + return nullptr; + } else + return nullptr; + + // User must be a binary operator with one or more uses. + Instruction *User = I->user_back(); + if (!isa<BinaryOperator>(User) || !User->getNumUses()) + return nullptr; + + unsigned UserOpcode = User->getOpcode(); + if (UserOpcode != Instruction::Add && UserOpcode != Instruction::FAdd && + UserOpcode != Instruction::Sub && UserOpcode != Instruction::FSub) + return nullptr; + + // Subtraction is not commutative. Explicitly, the following transform is + // not valid: (-Constant * y) - x -> x + (Constant * y) + if (!User->isCommutative() && User->getOperand(1) != I) + return nullptr; + + // Change the sign of the constant. + if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) + I->setOperand(ConstIdx, ConstantInt::get(CI->getContext(), -CI->getValue())); + else { + ConstantFP *CF = cast<ConstantFP>(C); + APFloat Val = CF->getValueAPF(); + Val.changeSign(); + I->setOperand(ConstIdx, ConstantFP::get(CF->getContext(), Val)); + } + + // Canonicalize I to RHS to simplify the next bit of logic. E.g., + // ((-Const*y) + x) -> (x + (-Const*y)). 
+ if (User->getOperand(0) == I && User->isCommutative()) + cast<BinaryOperator>(User)->swapOperands(); + + Value *Op0 = User->getOperand(0); + Value *Op1 = User->getOperand(1); + BinaryOperator *NI; + switch(UserOpcode) { + default: + llvm_unreachable("Unexpected Opcode!"); + case Instruction::Add: + NI = BinaryOperator::CreateSub(Op0, Op1); + break; + case Instruction::Sub: + NI = BinaryOperator::CreateAdd(Op0, Op1); + break; + case Instruction::FAdd: + NI = BinaryOperator::CreateFSub(Op0, Op1); + NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags()); + break; + case Instruction::FSub: + NI = BinaryOperator::CreateFAdd(Op0, Op1); + NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags()); + break; + } + + NI->insertBefore(User); + NI->setName(User->getName()); + User->replaceAllUsesWith(NI); + NI->setDebugLoc(I->getDebugLoc()); + RedoInsts.insert(I); + MadeChange = true; + return NI; +} + /// OptimizeInst - Inspect and optimize the given instruction. Note that erasing /// instructions is not allowed. void Reassociate::OptimizeInst(Instruction *I) { @@ -1810,8 +2064,7 @@ void Reassociate::OptimizeInst(Instruction *I) { if (!isa<BinaryOperator>(I)) return; - if (I->getOpcode() == Instruction::Shl && - isa<ConstantInt>(I->getOperand(1))) + if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1))) // If an operand of this shift is a reassociable multiply, or if the shift // is used by a reassociable multiply or add, turn into a multiply. if (isReassociableOp(I->getOperand(0), Instruction::Mul) || @@ -1824,29 +2077,23 @@ void Reassociate::OptimizeInst(Instruction *I) { I = NI; } - // Floating point binary operators are not associative, but we can still - // commute (some) of them, to canonicalize the order of their operands. - // This can potentially expose more CSE opportunities, and makes writing - // other transformations simpler. - if ((I->getType()->isFloatingPointTy() || I->getType()->isVectorTy())) { - // FAdd and FMul can be commuted. - if (I->getOpcode() != Instruction::FMul && - I->getOpcode() != Instruction::FAdd) - return; + // Canonicalize negative constants out of expressions. + if (Instruction *Res = canonicalizeNegConstExpr(I)) + I = Res; - Value *LHS = I->getOperand(0); - Value *RHS = I->getOperand(1); - unsigned LHSRank = getRank(LHS); - unsigned RHSRank = getRank(RHS); + // Commute binary operators, to canonicalize the order of their operands. + // This can potentially expose more CSE opportunities, and makes writing other + // transformations simpler. + if (I->isCommutative()) + canonicalizeOperands(I); - // Sort the operands by rank. - if (RHSRank < LHSRank) { - I->setOperand(0, RHS); - I->setOperand(1, LHS); - } + // Don't optimize vector instructions. + if (I->getType()->isVectorTy()) + return; + // Don't optimize floating point instructions that don't have unsafe algebra. + if (I->getType()->isFloatingPointTy() && !I->hasUnsafeAlgebra()) return; - } // Do not reassociate boolean (i1) expressions. We want to preserve the // original order of evaluation for short-circuited comparisons that @@ -1877,6 +2124,24 @@ void Reassociate::OptimizeInst(Instruction *I) { I = NI; } } + } else if (I->getOpcode() == Instruction::FSub) { + if (ShouldBreakUpSubtract(I)) { + Instruction *NI = BreakUpSubtract(I); + RedoInsts.insert(I); + MadeChange = true; + I = NI; + } else if (BinaryOperator::isFNeg(I)) { + // Otherwise, this is a negation. See if the operand is a multiply tree + // and if this is not an inner node of a multiply tree. 
+ if (isReassociableOp(I->getOperand(1), Instruction::FMul) && + (!I->hasOneUse() || + !isReassociableOp(I->user_back(), Instruction::FMul))) { + Instruction *NI = LowerNegateToMultiply(I); + RedoInsts.insert(I); + MadeChange = true; + I = NI; + } + } } // If this instruction is an associative binary operator, process it. @@ -1894,11 +2159,16 @@ void Reassociate::OptimizeInst(Instruction *I) { if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add && cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub) return; + if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd && + cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub) + return; ReassociateExpression(BO); } void Reassociate::ReassociateExpression(BinaryOperator *I) { + assert(!I->getType()->isVectorTy() && + "Reassociation of vector instructions is not supported."); // First, walk the expression tree, linearizing the tree, collecting the // operand information. @@ -1943,12 +2213,21 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) { // this is a multiply tree used only by an add, and the immediate is a -1. // In this case we reassociate to put the negation on the outside so that we // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y - if (I->getOpcode() == Instruction::Mul && I->hasOneUse() && - cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add && - isa<ConstantInt>(Ops.back().Op) && - cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) { - ValueEntry Tmp = Ops.pop_back_val(); - Ops.insert(Ops.begin(), Tmp); + if (I->hasOneUse()) { + if (I->getOpcode() == Instruction::Mul && + cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add && + isa<ConstantInt>(Ops.back().Op) && + cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) { + ValueEntry Tmp = Ops.pop_back_val(); + Ops.insert(Ops.begin(), Tmp); + } else if (I->getOpcode() == Instruction::FMul && + cast<Instruction>(I->user_back())->getOpcode() == + Instruction::FAdd && + isa<ConstantFP>(Ops.back().Op) && + cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)) { + ValueEntry Tmp = Ops.pop_back_val(); + Ops.insert(Ops.begin(), Tmp); + } } DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n'); diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index b6023e2ce789..1b46727c17bb 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -73,7 +73,7 @@ bool RegToMem::runOnFunction(Function &F) { // Insert all new allocas into entry block. BasicBlock *BBEntry = &F.getEntryBlock(); - assert(pred_begin(BBEntry) == pred_end(BBEntry) && + assert(pred_empty(BBEntry) && "Entry block to function must not have predecessors!"); // Find first non-alloca instruction and create insertion point. This is diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 90c3520c8323..cfc9a8e89fa0 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -214,7 +214,8 @@ public: /// /// This returns true if the block was not considered live before. bool MarkBlockExecutable(BasicBlock *BB) { - if (!BBExecutable.insert(BB)) return false; + if (!BBExecutable.insert(BB).second) + return false; DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n'); BBWorkList.push_back(BB); // Add the block to the work list! 
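Many of the one-line hunks in this commit, including the one just above, are the same mechanical update: SmallPtrSet::insert now returns an (iterator, bool) pair instead of a bare bool, so callers that only care about "was this element newly added" must read .second. A self-contained sketch of the new idiom (names are illustrative):

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/IR/Instruction.h"
    using namespace llvm;

    // Returns true only the first time I is seen; .second carries the
    // "newly inserted" flag that insert() used to return directly.
    static bool markVisited(SmallPtrSet<Instruction *, 16> &Visited,
                            Instruction *I) {
      return Visited.insert(I).second;
    }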
return true; @@ -1010,7 +1011,7 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { } Constant *Ptr = Operands[0]; - ArrayRef<Constant *> Indices(Operands.begin() + 1, Operands.end()); + auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end()); markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, Indices)); } @@ -1107,6 +1108,9 @@ CallOverdefined: Operands.push_back(State.getConstant()); } + if (getValueState(I).isOverdefined()) + return; + // If we can constant fold this, mark the result of the call as a // constant. if (Constant *C = ConstantFoldCall(F, Operands, TLI)) diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index f902eb23cbcf..ed161fd4af3e 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" @@ -78,8 +79,8 @@ STATISTIC(NumVectorized, "Number of vectorized aggregates"); /// Hidden option to force the pass to not use DomTree and mem2reg, instead /// forming SSA values through the SSAUpdater infrastructure. -static cl::opt<bool> -ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); +static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false), + cl::Hidden); /// Hidden option to enable randomly shuffling the slices to help uncover /// instability in their order. @@ -88,15 +89,15 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", /// Hidden option to experiment with completely strict handling of inbounds /// GEPs. -static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", - cl::init(false), cl::Hidden); +static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false), + cl::Hidden); namespace { /// \brief A custom IRBuilder inserter which prefixes all names if they are /// preserved. template <bool preserveNames = true> -class IRBuilderPrefixedInserter : - public IRBuilderDefaultInserter<preserveNames> { +class IRBuilderPrefixedInserter + : public IRBuilderDefaultInserter<preserveNames> { std::string Prefix; public: @@ -112,19 +113,19 @@ protected: // Specialization for not preserving the name is trivial. template <> -class IRBuilderPrefixedInserter<false> : - public IRBuilderDefaultInserter<false> { +class IRBuilderPrefixedInserter<false> + : public IRBuilderDefaultInserter<false> { public: void SetNamePrefix(const Twine &P) {} }; /// \brief Provide a typedef for IRBuilder that drops names in release builds. #ifndef NDEBUG -typedef llvm::IRBuilder<true, ConstantFolder, - IRBuilderPrefixedInserter<true> > IRBuilderTy; +typedef llvm::IRBuilder<true, ConstantFolder, IRBuilderPrefixedInserter<true>> + IRBuilderTy; #else -typedef llvm::IRBuilder<false, ConstantFolder, - IRBuilderPrefixedInserter<false> > IRBuilderTy; +typedef llvm::IRBuilder<false, ConstantFolder, IRBuilderPrefixedInserter<false>> + IRBuilderTy; #endif } @@ -170,10 +171,14 @@ public: /// decreasing. Thus the spanning range comes first in a cluster with the /// same start position. 
bool operator<(const Slice &RHS) const { - if (beginOffset() < RHS.beginOffset()) return true; - if (beginOffset() > RHS.beginOffset()) return false; - if (isSplittable() != RHS.isSplittable()) return !isSplittable(); - if (endOffset() > RHS.endOffset()) return true; + if (beginOffset() < RHS.beginOffset()) + return true; + if (beginOffset() > RHS.beginOffset()) + return false; + if (isSplittable() != RHS.isSplittable()) + return !isSplittable(); + if (endOffset() > RHS.endOffset()) + return true; return false; } @@ -197,9 +202,7 @@ public: namespace llvm { template <typename T> struct isPodLike; -template <> struct isPodLike<Slice> { - static const bool value = true; -}; +template <> struct isPodLike<Slice> { static const bool value = true; }; } namespace { @@ -224,36 +227,318 @@ public: /// \brief Support for iterating over the slices. /// @{ typedef SmallVectorImpl<Slice>::iterator iterator; + typedef iterator_range<iterator> range; iterator begin() { return Slices.begin(); } iterator end() { return Slices.end(); } typedef SmallVectorImpl<Slice>::const_iterator const_iterator; + typedef iterator_range<const_iterator> const_range; const_iterator begin() const { return Slices.begin(); } const_iterator end() const { return Slices.end(); } /// @} - /// \brief Allow iterating the dead users for this alloca. + /// \brief Erase a range of slices. + void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); } + + /// \brief Insert new slices for this alloca. /// - /// These are instructions which will never actually use the alloca as they - /// are outside the allocated range. They are safe to replace with undef and - /// delete. - /// @{ - typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator; - dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); } - dead_user_iterator dead_user_end() const { return DeadUsers.end(); } - /// @} + /// This moves the slices into the alloca's slices collection, and re-sorts + /// everything so that the usual ordering properties of the alloca's slices + /// hold. + void insert(ArrayRef<Slice> NewSlices) { + int OldSize = Slices.size(); + std::move(NewSlices.begin(), NewSlices.end(), std::back_inserter(Slices)); + auto SliceI = Slices.begin() + OldSize; + std::sort(SliceI, Slices.end()); + std::inplace_merge(Slices.begin(), SliceI, Slices.end()); + } + + // Forward declare an iterator to befriend it. + class partition_iterator; + + /// \brief A partition of the slices. + /// + /// An ephemeral representation for a range of slices which can be viewed as + /// a partition of the alloca. This range represents a span of the alloca's + /// memory which cannot be split, and provides access to all of the slices + /// overlapping some part of the partition. + /// + /// Objects of this type are produced by traversing the alloca's slices, but + /// are only ephemeral and not persistent. + class Partition { + private: + friend class AllocaSlices; + friend class AllocaSlices::partition_iterator; + + /// \brief The begining and ending offsets of the alloca for this partition. + uint64_t BeginOffset, EndOffset; + + /// \brief The start end end iterators of this partition. + iterator SI, SJ; + + /// \brief A collection of split slice tails overlapping the partition. + SmallVector<Slice *, 4> SplitTails; + + /// \brief Raw constructor builds an empty partition starting and ending at + /// the given iterator. + Partition(iterator SI) : SI(SI), SJ(SI) {} + + public: + /// \brief The start offset of this partition. 
+ /// + /// All of the contained slices start at or after this offset. + uint64_t beginOffset() const { return BeginOffset; } - /// \brief Allow iterating the dead expressions referring to this alloca. + /// \brief The end offset of this partition. + /// + /// All of the contained slices end at or before this offset. + uint64_t endOffset() const { return EndOffset; } + + /// \brief The size of the partition. + /// + /// Note that this can never be zero. + uint64_t size() const { + assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); + return EndOffset - BeginOffset; + } + + /// \brief Test whether this partition contains no slices, and merely spans + /// a region occupied by split slices. + bool empty() const { return SI == SJ; } + + /// \name Iterate slices that start within the partition. + /// These may be splittable or unsplittable. They have a begin offset >= the + /// partition begin offset. + /// @{ + // FIXME: We should probably define a "concat_iterator" helper and use that + // to stitch together pointee_iterators over the split tails and the + // contiguous iterators of the partition. That would give a much nicer + // interface here. We could then additionally expose filtered iterators for + // split, unsplit, and unsplittable splices based on the usage patterns. + iterator begin() const { return SI; } + iterator end() const { return SJ; } + /// @} + + /// \brief Get the sequence of split slice tails. + /// + /// These tails are of slices which start before this partition but are + /// split and overlap into the partition. We accumulate these while forming + /// partitions. + ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } + }; + + /// \brief An iterator over partitions of the alloca's slices. + /// + /// This iterator implements the core algorithm for partitioning the alloca's + /// slices. It is a forward iterator as we don't support backtracking for + /// efficiency reasons, and re-use a single storage area to maintain the + /// current set of split slices. + /// + /// It is templated on the slice iterator type to use so that it can operate + /// with either const or non-const slice iterators. + class partition_iterator + : public iterator_facade_base<partition_iterator, + std::forward_iterator_tag, Partition> { + friend class AllocaSlices; + + /// \brief Most of the state for walking the partitions is held in a class + /// with a nice interface for examining them. + Partition P; + + /// \brief We need to keep the end of the slices to know when to stop. + AllocaSlices::iterator SE; + + /// \brief We also need to keep track of the maximum split end offset seen. + /// FIXME: Do we really? + uint64_t MaxSplitSliceEndOffset; + + /// \brief Sets the partition to be empty at given iterator, and sets the + /// end iterator. + partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) + : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { + // If not already at the end, advance our state to form the initial + // partition. + if (SI != SE) + advance(); + } + + /// \brief Advance the iterator to the next partition. + /// + /// Requires that the iterator not be at the end of the slices. + void advance() { + assert((P.SI != SE || !P.SplitTails.empty()) && + "Cannot advance past the end of the slices!"); + + // Clear out any split uses which have ended. + if (!P.SplitTails.empty()) { + if (P.EndOffset >= MaxSplitSliceEndOffset) { + // If we've finished all splits, this is easy. 
+ P.SplitTails.clear(); + MaxSplitSliceEndOffset = 0; + } else { + // Remove the uses which have ended in the prior partition. This + // cannot change the max split slice end because we just checked that + // the prior partition ended prior to that max. + P.SplitTails.erase( + std::remove_if( + P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), + P.SplitTails.end()); + assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { + return S->endOffset() == MaxSplitSliceEndOffset; + }) && + "Could not find the current max split slice offset!"); + assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { + return S->endOffset() <= MaxSplitSliceEndOffset; + }) && + "Max split slice end offset is not actually the max!"); + } + } + + // If P.SI is already at the end, then we've cleared the split tail and + // now have an end iterator. + if (P.SI == SE) { + assert(P.SplitTails.empty() && "Failed to clear the split slices!"); + return; + } + + // If we had a non-empty partition previously, set up the state for + // subsequent partitions. + if (P.SI != P.SJ) { + // Accumulate all the splittable slices which started in the old + // partition into the split list. + for (Slice &S : P) + if (S.isSplittable() && S.endOffset() > P.EndOffset) { + P.SplitTails.push_back(&S); + MaxSplitSliceEndOffset = + std::max(S.endOffset(), MaxSplitSliceEndOffset); + } + + // Start from the end of the previous partition. + P.SI = P.SJ; + + // If P.SI is now at the end, we at most have a tail of split slices. + if (P.SI == SE) { + P.BeginOffset = P.EndOffset; + P.EndOffset = MaxSplitSliceEndOffset; + return; + } + + // If the we have split slices and the next slice is after a gap and is + // not splittable immediately form an empty partition for the split + // slices up until the next slice begins. + if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && + !P.SI->isSplittable()) { + P.BeginOffset = P.EndOffset; + P.EndOffset = P.SI->beginOffset(); + return; + } + } + + // OK, we need to consume new slices. Set the end offset based on the + // current slice, and step SJ past it. The beginning offset of the + // parttion is the beginning offset of the next slice unless we have + // pre-existing split slices that are continuing, in which case we begin + // at the prior end offset. + P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; + P.EndOffset = P.SI->endOffset(); + ++P.SJ; + + // There are two strategies to form a partition based on whether the + // partition starts with an unsplittable slice or a splittable slice. + if (!P.SI->isSplittable()) { + // When we're forming an unsplittable region, it must always start at + // the first slice and will extend through its end. + assert(P.BeginOffset == P.SI->beginOffset()); + + // Form a partition including all of the overlapping slices with this + // unsplittable slice. + while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { + if (!P.SJ->isSplittable()) + P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); + ++P.SJ; + } + + // We have a partition across a set of overlapping unsplittable + // partitions. + return; + } + + // If we're starting with a splittable slice, then we need to form + // a synthetic partition spanning it and any other overlapping splittable + // splices. + assert(P.SI->isSplittable() && "Forming a splittable partition!"); + + // Collect all of the overlapping splittable slices. 
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset && + P.SJ->isSplittable()) { + P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); + ++P.SJ; + } + + // Back upiP.EndOffset if we ended the span early when encountering an + // unsplittable slice. This synthesizes the early end offset of + // a partition spanning only splittable slices. + if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { + assert(!P.SJ->isSplittable()); + P.EndOffset = P.SJ->beginOffset(); + } + } + + public: + bool operator==(const partition_iterator &RHS) const { + assert(SE == RHS.SE && + "End iterators don't match between compared partition iterators!"); + + // The observed positions of partitions is marked by the P.SI iterator and + // the emptyness of the split slices. The latter is only relevant when + // P.SI == SE, as the end iterator will additionally have an empty split + // slices list, but the prior may have the same P.SI and a tail of split + // slices. + if (P.SI == RHS.P.SI && + P.SplitTails.empty() == RHS.P.SplitTails.empty()) { + assert(P.SJ == RHS.P.SJ && + "Same set of slices formed two different sized partitions!"); + assert(P.SplitTails.size() == RHS.P.SplitTails.size() && + "Same slice position with differently sized non-empty split " + "slice tails!"); + return true; + } + return false; + } + + partition_iterator &operator++() { + advance(); + return *this; + } + + Partition &operator*() { return P; } + }; + + /// \brief A forward range over the partitions of the alloca's slices. + /// + /// This accesses an iterator range over the partitions of the alloca's + /// slices. It computes these partitions on the fly based on the overlapping + /// offsets of the slices and the ability to split them. It will visit "empty" + /// partitions to cover regions of the alloca only accessed via split + /// slices. + iterator_range<partition_iterator> partitions() { + return make_range(partition_iterator(begin(), end()), + partition_iterator(end(), end())); + } + + /// \brief Access the dead users for this alloca. + ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; } + + /// \brief Access the dead operands referring to this alloca. /// /// These are operands which have cannot actually be used to refer to the /// alloca as they are outside its range and the user doesn't correct for /// that. These mostly consist of PHI node inputs and the like which we just /// need to replace with undef. - /// @{ - typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator; - dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); } - dead_op_iterator dead_op_end() const { return DeadOperands.end(); } - /// @} + ArrayRef<Use *> getDeadOperands() const { return DeadOperands; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; @@ -317,13 +602,22 @@ static Value *foldSelectInst(SelectInst &SI) { // being selected between, fold the select. Yes this does (rarely) happen // early on. if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) - return SI.getOperand(1+CI->isZero()); + return SI.getOperand(1 + CI->isZero()); if (SI.getOperand(1) == SI.getOperand(2)) return SI.getOperand(1); return nullptr; } +/// \brief A helper that folds a PHI node or a select. +static Value *foldPHINodeOrSelectInst(Instruction &I) { + if (PHINode *PN = dyn_cast<PHINode>(&I)) { + // If PN merges together the same value, return that value. 
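The partition machinery introduced above replaces the old explicit (begin, end, split-uses) bookkeeping with partitions computed on the fly. For intuition: given slices [0,8) unsplittable, [0,16) splittable and [8,16) unsplittable, the walk yields a partition [0,8) holding the first two slices, with the remainder of [0,16) carried along as a split tail into the following partition [8,16). A usage sketch, assuming an AllocaSlices AS already built for some alloca (the loop bodies are placeholders):

    for (AllocaSlices::Partition &P : AS.partitions()) {
      // Each partition spans [P.beginOffset(), P.endOffset()).
      for (Slice &S : P) {
        // Slices that start inside this partition.
        (void)S;
      }
      for (Slice *S : P.splitSliceTails()) {
        // Tails of splittable slices that began earlier but overlap here.
        (void)S;
      }
    }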
+ return PN->hasConstantValue(); + } + return foldSelectInst(cast<SelectInst>(I)); +} + /// \brief Builder for the alloca slices. /// /// This class builds a set of alloca slices by recursively visiting the uses @@ -334,7 +628,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { typedef PtrUseVisitor<SliceBuilder> Base; const uint64_t AllocSize; - AllocaSlices &S; + AllocaSlices &AS; SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap; SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes; @@ -343,14 +637,14 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { SmallPtrSet<Instruction *, 4> VisitedDeadInsts; public: - SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &S) + SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS) : PtrUseVisitor<SliceBuilder>(DL), - AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), S(S) {} + AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {} private: void markAsDead(Instruction &I) { - if (VisitedDeadInsts.insert(&I)) - S.DeadUsers.push_back(&I); + if (VisitedDeadInsts.insert(&I).second) + AS.DeadUsers.push_back(&I); } void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, @@ -361,7 +655,7 @@ private: DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset << " which has zero size or starts outside of the " << AllocSize << " byte alloca:\n" - << " alloca: " << S.AI << "\n" + << " alloca: " << AS.AI << "\n" << " use: " << I << "\n"); return markAsDead(I); } @@ -379,12 +673,12 @@ private: if (Size > AllocSize - BeginOffset) { DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset << " to remain within the " << AllocSize << " byte alloca:\n" - << " alloca: " << S.AI << "\n" + << " alloca: " << AS.AI << "\n" << " use: " << I << "\n"); EndOffset = AllocSize; } - S.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable)); + AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable)); } void visitBitCastInst(BitCastInst &BC) { @@ -421,7 +715,8 @@ private: GEPOffset += APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx)); } else { - // For array or vector indices, scale the index by the size of the type. + // For array or vector indices, scale the index by the size of the + // type. APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); GEPOffset += Index * APInt(Offset.getBitWidth(), DL.getTypeAllocSize(GTI.getIndexedType())); @@ -440,16 +735,10 @@ private: void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, uint64_t Size, bool IsVolatile) { - // We allow splitting of loads and stores where the type is an integer type - // and cover the entire alloca. This prevents us from splitting over - // eagerly. - // FIXME: In the great blue eventually, we should eagerly split all integer - // loads and stores, and then have a separate step that merges adjacent - // alloca partitions into a single partition suitable for integer widening. - // Or we should skip the merge step and rely on GVN and other passes to - // merge adjacent loads and stores that survive mem2reg. - bool IsSplittable = - Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize; + // We allow splitting of non-volatile loads and stores where the type is an + // integer type. These may be used to implement 'memcpy' or other "transfer + // of bits" patterns. 
+ bool IsSplittable = Ty->isIntegerTy() && !IsVolatile; insertUse(I, Offset, Size, IsSplittable); } @@ -485,7 +774,7 @@ private: DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset << " which extends past the end of the " << AllocSize << " byte alloca:\n" - << " alloca: " << S.AI << "\n" + << " alloca: " << AS.AI << "\n" << " use: " << SI << "\n"); return markAsDead(SI); } @@ -495,7 +784,6 @@ private: handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile()); } - void visitMemSetInst(MemSetInst &II) { assert(II.getRawDest() == *U && "Pointer use is not the destination?"); ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); @@ -507,9 +795,8 @@ private: if (!IsOffsetKnown) return PI.setAborted(&II); - insertUse(II, Offset, - Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue(), + insertUse(II, Offset, Length ? Length->getLimitedValue() + : AllocSize - Offset.getLimitedValue(), (bool)Length); } @@ -533,15 +820,15 @@ private: // FIXME: Yet another place we really should bypass this when // instrumenting for ASan. if (Offset.uge(AllocSize)) { - SmallDenseMap<Instruction *, unsigned>::iterator MTPI = MemTransferSliceMap.find(&II); + SmallDenseMap<Instruction *, unsigned>::iterator MTPI = + MemTransferSliceMap.find(&II); if (MTPI != MemTransferSliceMap.end()) - S.Slices[MTPI->second].kill(); + AS.Slices[MTPI->second].kill(); return markAsDead(II); } uint64_t RawOffset = Offset.getLimitedValue(); - uint64_t Size = Length ? Length->getLimitedValue() - : AllocSize - RawOffset; + uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset; // Check for the special case where the same exact value is used for both // source and dest. @@ -558,10 +845,10 @@ private: bool Inserted; SmallDenseMap<Instruction *, unsigned>::iterator MTPI; std::tie(MTPI, Inserted) = - MemTransferSliceMap.insert(std::make_pair(&II, S.Slices.size())); + MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size())); unsigned PrevIdx = MTPI->second; if (!Inserted) { - Slice &PrevP = S.Slices[PrevIdx]; + Slice &PrevP = AS.Slices[PrevIdx]; // Check if the begin offsets match and this is a non-volatile transfer. // In that case, we can completely elide the transfer. @@ -579,7 +866,7 @@ private: insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length); // Check that we ended up with a valid index in the map. - assert(S.Slices[PrevIdx].getUse()->getUser() == &II && + assert(AS.Slices[PrevIdx].getUse()->getUser() == &II && "Map index doesn't point back to a slice with this user."); } @@ -639,64 +926,47 @@ private: } for (User *U : I->users()) - if (Visited.insert(cast<Instruction>(U))) + if (Visited.insert(cast<Instruction>(U)).second) Uses.push_back(std::make_pair(I, cast<Instruction>(U))); } while (!Uses.empty()); return nullptr; } - void visitPHINode(PHINode &PN) { - if (PN.use_empty()) - return markAsDead(PN); - if (!IsOffsetKnown) - return PI.setAborted(&PN); - - // See if we already have computed info on this node. - uint64_t &PHISize = PHIOrSelectSizes[&PN]; - if (!PHISize) { - // This is a new PHI node, check for an unsafe use of the PHI node. - if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHISize)) - return PI.setAborted(UnsafeI); - } - - // For PHI and select operands outside the alloca, we can't nuke the entire - // phi or select -- the other side might still be relevant, so we special - // case them here and use a separate structure to track the operands - // themselves which should be replaced with undef. 
- // FIXME: This should instead be escaped in the event we're instrumenting - // for address sanitization. - if (Offset.uge(AllocSize)) { - S.DeadOperands.push_back(U); - return; - } - - insertUse(PN, Offset, PHISize); - } + void visitPHINodeOrSelectInst(Instruction &I) { + assert(isa<PHINode>(I) || isa<SelectInst>(I)); + if (I.use_empty()) + return markAsDead(I); - void visitSelectInst(SelectInst &SI) { - if (SI.use_empty()) - return markAsDead(SI); - if (Value *Result = foldSelectInst(SI)) { + // TODO: We could use SimplifyInstruction here to fold PHINodes and + // SelectInsts. However, doing so requires to change the current + // dead-operand-tracking mechanism. For instance, suppose neither loading + // from %U nor %other traps. Then "load (select undef, %U, %other)" does not + // trap either. However, if we simply replace %U with undef using the + // current dead-operand-tracking mechanism, "load (select undef, undef, + // %other)" may trap because the select may return the first operand + // "undef". + if (Value *Result = foldPHINodeOrSelectInst(I)) { if (Result == *U) // If the result of the constant fold will be the pointer, recurse - // through the select as if we had RAUW'ed it. - enqueueUsers(SI); + // through the PHI/select as if we had RAUW'ed it. + enqueueUsers(I); else - // Otherwise the operand to the select is dead, and we can replace it - // with undef. - S.DeadOperands.push_back(U); + // Otherwise the operand to the PHI/select is dead, and we can replace + // it with undef. + AS.DeadOperands.push_back(U); return; } + if (!IsOffsetKnown) - return PI.setAborted(&SI); + return PI.setAborted(&I); // See if we already have computed info on this node. - uint64_t &SelectSize = PHIOrSelectSizes[&SI]; - if (!SelectSize) { - // This is a new Select, check for an unsafe use of it. - if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectSize)) + uint64_t &Size = PHIOrSelectSizes[&I]; + if (!Size) { + // This is a new PHI/Select, check for an unsafe use of it. + if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size)) return PI.setAborted(UnsafeI); } @@ -707,17 +977,19 @@ private: // FIXME: This should instead be escaped in the event we're instrumenting // for address sanitization. if (Offset.uge(AllocSize)) { - S.DeadOperands.push_back(U); + AS.DeadOperands.push_back(U); return; } - insertUse(SI, Offset, SelectSize); + insertUse(I, Offset, Size); } + void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); } + + void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); } + /// \brief Disable SROA entirely if there are unhandled users of the alloca. 
- void visitInstruction(Instruction &I) { - PI.setAborted(&I); - } + void visitInstruction(Instruction &I) { PI.setAborted(&I); } }; AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) @@ -738,7 +1010,9 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) } Slices.erase(std::remove_if(Slices.begin(), Slices.end(), - std::mem_fun_ref(&Slice::isDead)), + [](const Slice &S) { + return S.isDead(); + }), Slices.end()); #if __cplusplus >= 201103L && !defined(NDEBUG) @@ -758,6 +1032,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) void AllocaSlices::print(raw_ostream &OS, const_iterator I, StringRef Indent) const { printSlice(OS, I, Indent); + OS << "\n"; printUse(OS, I, Indent); } @@ -765,7 +1040,7 @@ void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, StringRef Indent) const { OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" << " slice #" << (I - begin()) - << (I->isSplittable() ? " (splittable)" : "") << "\n"; + << (I->isSplittable() ? " (splittable)" : ""); } void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, @@ -813,15 +1088,17 @@ public: AllocaInst &AI, DIBuilder &DIB) : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} - void run(const SmallVectorImpl<Instruction*> &Insts) { + void run(const SmallVectorImpl<Instruction *> &Insts) { // Retain the debug information attached to the alloca for use when // rewriting loads and stores. - if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { - for (User *U : DebugNode->users()) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) - DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) - DVIs.push_back(DVI); + if (auto *L = LocalAsMetadata::getIfExists(&AI)) { + if (auto *DebugNode = MetadataAsValue::getIfExists(AI.getContext(), L)) { + for (User *U : DebugNode->users()) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) + DDIs.push_back(DDI); + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) + DVIs.push_back(DVI); + } } LoadAndStorePromoter::run(Insts); @@ -834,8 +1111,9 @@ public: DVIs.pop_back_val()->eraseFromParent(); } - bool isInstInList(Instruction *I, - const SmallVectorImpl<Instruction*> &Insts) const override { + bool + isInstInList(Instruction *I, + const SmallVectorImpl<Instruction *> &Insts) const override { Value *Ptr; if (LoadInst *LI = dyn_cast<LoadInst>(I)) Ptr = LI->getOperand(0); @@ -857,23 +1135,18 @@ public: else return false; - } while (Visited.insert(Ptr)); + } while (Visited.insert(Ptr).second); return false; } void updateDebugInfo(Instruction *Inst) const override { - for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), - E = DDIs.end(); I != E; ++I) { - DbgDeclareInst *DDI = *I; + for (DbgDeclareInst *DDI : DDIs) if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) ConvertDebugDeclareToDebugValue(DDI, SI, DIB); else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) ConvertDebugDeclareToDebugValue(DDI, LI, DIB); - } - for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), - E = DVIs.end(); I != E; ++I) { - DbgValueInst *DVI = *I; + for (DbgValueInst *DVI : DVIs) { Value *Arg = nullptr; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // If an argument is zero extended then use argument directly. 
The ZExt @@ -890,15 +1163,14 @@ public: continue; } Instruction *DbgVal = - DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), - Inst); + DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), + DIExpression(DVI->getExpression()), Inst); DbgVal->setDebugLoc(DVI->getDebugLoc()); } } }; } // end anon namespace - namespace { /// \brief An optimization pass providing Scalar Replacement of Aggregates. /// @@ -924,6 +1196,7 @@ class SROA : public FunctionPass { LLVMContext *C; const DataLayout *DL; DominatorTree *DT; + AssumptionCache *AC; /// \brief Worklist of alloca instructions to simplify. /// @@ -932,12 +1205,12 @@ class SROA : public FunctionPass { /// directly promoted. Finally, each time we rewrite a use of an alloca other /// the one being actively rewritten, we add it back onto the list if not /// already present to ensure it is re-visited. - SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; + SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist; /// \brief A collection of instructions to delete. /// We try to batch deletions to simplify code and make things a bit more /// efficient. - SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts; + SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts; /// \brief Post-promotion worklist. /// @@ -947,7 +1220,7 @@ class SROA : public FunctionPass { /// /// Note that we have to be very careful to clear allocas out of this list in /// the event they are deleted. - SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; + SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist; /// \brief A collection of alloca instructions we can directly promote. std::vector<AllocaInst *> PromotableAllocas; @@ -957,7 +1230,7 @@ class SROA : public FunctionPass { /// All of these PHIs have been checked for the safety of speculation and by /// being speculated will allow promoting allocas currently in the promotable /// queue. - SetVector<PHINode *, SmallVector<PHINode *, 2> > SpeculatablePHIs; + SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs; /// \brief A worklist of select instructions to speculate prior to promoting /// allocas. @@ -965,12 +1238,12 @@ class SROA : public FunctionPass { /// All of these select instructions have been checked for the safety of /// speculation and by being speculated will allow promoting allocas /// currently in the promotable queue. 
- SetVector<SelectInst *, SmallVector<SelectInst *, 2> > SpeculatableSelects; + SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects; public: SROA(bool RequiresDomTree = true) - : FunctionPass(ID), RequiresDomTree(RequiresDomTree), - C(nullptr), DL(nullptr), DT(nullptr) { + : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr), + DL(nullptr), DT(nullptr) { initializeSROAPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -983,14 +1256,13 @@ private: friend class PHIOrSelectSpeculator; friend class AllocaSliceRewriter; - bool rewritePartition(AllocaInst &AI, AllocaSlices &S, - AllocaSlices::iterator B, AllocaSlices::iterator E, - int64_t BeginOffset, int64_t EndOffset, - ArrayRef<AllocaSlices::iterator> SplitUses); - bool splitAlloca(AllocaInst &AI, AllocaSlices &S); + bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); + bool rewritePartition(AllocaInst &AI, AllocaSlices &AS, + AllocaSlices::Partition &P); + bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); bool runOnAlloca(AllocaInst &AI); void clobberUse(Use &U); - void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); + void deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas); bool promoteAllocas(Function &F); }; } @@ -1001,11 +1273,12 @@ FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { return new SROA(RequiresDomTree); } -INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", - false, false) +INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false, + false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", - false, false) +INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, + false) /// Walk the range of a partitioning looking for a common type to cover this /// sequence of slices. @@ -1076,8 +1349,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B, /// /// FIXME: This should be hoisted into a generic utility, likely in /// Transforms/Util/Local.h -static bool isSafePHIToSpeculate(PHINode &PN, - const DataLayout *DL = nullptr) { +static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) { // For now, we can only do this promotion if the load is in the same block // as the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. @@ -1148,10 +1420,12 @@ static void speculatePHINodeLoads(PHINode &PN) { PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), PN.getName() + ".sroa.speculated"); - // Get the TBAA tag and alignment to use from one of the loads. It doesn't + // Get the AA tags and alignment to use from one of the loads. It doesn't // matter which one we get and if any differ. LoadInst *SomeLoad = cast<LoadInst>(PN.user_back()); - MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); + + AAMDNodes AATags; + SomeLoad->getAAMetadata(AATags); unsigned Align = SomeLoad->getAlignment(); // Rewrite all loads of the PN to use the new PHI. @@ -1172,8 +1446,8 @@ static void speculatePHINodeLoads(PHINode &PN) { InVal, (PN.getName() + ".sroa.speculate.load." 
+ Pred->getName())); ++NumLoadsSpeculated; Load->setAlignment(Align); - if (TBAATag) - Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); + if (AATags) + Load->setAAMetadata(AATags); NewPN->addIncoming(Load, Pred); } @@ -1238,12 +1512,15 @@ static void speculateSelectInstLoads(SelectInst &SI) { IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); NumLoadsSpeculated += 2; - // Transfer alignment and TBAA info if present. + // Transfer alignment and AA info if present. TL->setAlignment(LI->getAlignment()); FL->setAlignment(LI->getAlignment()); - if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { - TL->setMetadata(LLVMContext::MD_tbaa, Tag); - FL->setMetadata(LLVMContext::MD_tbaa, Tag); + + AAMDNodes Tags; + LI->getAAMetadata(Tags); + if (Tags) { + TL->setAAMetadata(Tags); + FL->setAAMetadata(Tags); } Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, @@ -1332,7 +1609,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, SmallVectorImpl<Value *> &Indices, Twine NamePrefix) { if (Offset == 0) - return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix); + return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, + NamePrefix); // We can't recurse through pointer types. if (Ty->isPointerTy()) @@ -1440,8 +1718,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, - APInt Offset, Type *PointerTy, - Twine NamePrefix) { + APInt Offset, Type *PointerTy, Twine NamePrefix) { // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. SmallPtrSet<Value *, 4> Visited; @@ -1450,8 +1727,9 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, // We may end up computing an offset pointer that has the wrong type. If we // never are able to compute one directly that has the correct type, we'll - // fall back to it, so keep it around here. + // fall back to it, so keep it and the base it was computed from around here. Value *OffsetPtr = nullptr; + Value *OffsetBasePtr; // Remember any i8 pointer we come across to re-use if we need to do a raw // byte offset. @@ -1468,7 +1746,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, break; Offset += GEPOffset; Ptr = GEP->getPointerOperand(); - if (!Visited.insert(Ptr)) + if (!Visited.insert(Ptr).second) break; } @@ -1476,16 +1754,19 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, Indices.clear(); if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy, Indices, NamePrefix)) { - if (P->getType() == PointerTy) { - // Zap any offset pointer that we ended up computing in previous rounds. - if (OffsetPtr && OffsetPtr->use_empty()) - if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) - I->eraseFromParent(); + // If we have a new natural pointer at the offset, clear out any old + // offset pointer we computed. Unless it is the base pointer or + // a non-instruction, we built a GEP we don't need. Zap it. + if (OffsetPtr && OffsetPtr != OffsetBasePtr) + if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) { + assert(I->use_empty() && "Built a GEP with uses some how!"); + I->eraseFromParent(); + } + OffsetPtr = P; + OffsetBasePtr = Ptr; + // If we also found a pointer of the right type, we're done. 
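The speculation helpers above switch from copying only !tbaa to copying the whole AAMDNodes bundle, which carries !tbaa together with !alias.scope and !noalias. The new idiom in isolation, with OldLI and NewLI as illustrative stand-ins for the original and the speculated load:

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Metadata.h"
    using namespace llvm;

    static void copyAAMetadata(LoadInst *OldLI, LoadInst *NewLI) {
      // Gather whatever AA metadata the original load carried and, if any
      // was present, attach the whole bundle to the new load.
      AAMDNodes Tags;
      OldLI->getAAMetadata(Tags);
      if (Tags)
        NewLI->setAAMetadata(Tags);
    }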
+ if (P->getType() == PointerTy) return P; - } - if (!OffsetPtr) { - OffsetPtr = P; - } } // Stash this pointer if we've found an i8*. @@ -1505,7 +1786,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, break; } assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); - } while (Visited.insert(Ptr)); + } while (Visited.insert(Ptr).second); if (!OffsetPtr) { if (!Int8Ptr) { @@ -1515,9 +1796,10 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, Int8PtrOffset = Offset; } - OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : - IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), - NamePrefix + "sroa_raw_idx"); + OffsetPtr = Int8PtrOffset == 0 + ? Int8Ptr + : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), + NamePrefix + "sroa_raw_idx"); } Ptr = OffsetPtr; @@ -1528,6 +1810,27 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, return Ptr; } +/// \brief Compute the adjusted alignment for a load or store from an offset. +static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, + const DataLayout &DL) { + unsigned Alignment; + Type *Ty; + if (auto *LI = dyn_cast<LoadInst>(I)) { + Alignment = LI->getAlignment(); + Ty = LI->getType(); + } else if (auto *SI = dyn_cast<StoreInst>(I)) { + Alignment = SI->getAlignment(); + Ty = SI->getValueOperand()->getType(); + } else { + llvm_unreachable("Only loads and stores are allowed!"); + } + + if (!Alignment) + Alignment = DL.getABITypeAlignment(Ty); + + return MinAlign(Alignment, Offset); +} + /// \brief Test whether we can convert a value from the old to the new type. /// /// This predicate should be used to guard calls to convertValue in order to @@ -1621,39 +1924,43 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, /// /// This function is called to test each entry in a partioning which is slated /// for a single slice. -static bool isVectorPromotionViableForSlice( - const DataLayout &DL, AllocaSlices &S, uint64_t SliceBeginOffset, - uint64_t SliceEndOffset, VectorType *Ty, uint64_t ElementSize, - AllocaSlices::const_iterator I) { +static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P, + const Slice &S, VectorType *Ty, + uint64_t ElementSize, + const DataLayout &DL) { // First validate the slice offsets. uint64_t BeginOffset = - std::max(I->beginOffset(), SliceBeginOffset) - SliceBeginOffset; + std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset(); uint64_t BeginIndex = BeginOffset / ElementSize; if (BeginIndex * ElementSize != BeginOffset || BeginIndex >= Ty->getNumElements()) return false; uint64_t EndOffset = - std::min(I->endOffset(), SliceEndOffset) - SliceBeginOffset; + std::min(S.endOffset(), P.endOffset()) - P.beginOffset(); uint64_t EndIndex = EndOffset / ElementSize; if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements()) return false; assert(EndIndex > BeginIndex && "Empty vector!"); uint64_t NumElements = EndIndex - BeginIndex; - Type *SliceTy = - (NumElements == 1) ? Ty->getElementType() - : VectorType::get(Ty->getElementType(), NumElements); + Type *SliceTy = (NumElements == 1) + ? 
Ty->getElementType() + : VectorType::get(Ty->getElementType(), NumElements); Type *SplitIntTy = Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8); - Use *U = I->getUse(); + Use *U = S.getUse(); if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { if (MI->isVolatile()) return false; - if (!I->isSplittable()) + if (!S.isSplittable()) return false; // Skip any unsplittable intrinsics. + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return false; } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { // Disable vector promotion when there are loads or stores of an FCA. return false; @@ -1661,8 +1968,7 @@ static bool isVectorPromotionViableForSlice( if (LI->isVolatile()) return false; Type *LTy = LI->getType(); - if (SliceBeginOffset > I->beginOffset() || - SliceEndOffset < I->endOffset()) { + if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { assert(LTy->isIntegerTy()); LTy = SplitIntTy; } @@ -1672,8 +1978,7 @@ static bool isVectorPromotionViableForSlice( if (SI->isVolatile()) return false; Type *STy = SI->getValueOperand()->getType(); - if (SliceBeginOffset > I->beginOffset() || - SliceEndOffset < I->endOffset()) { + if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { assert(STy->isIntegerTy()); STy = SplitIntTy; } @@ -1695,65 +2000,137 @@ static bool isVectorPromotionViableForSlice( /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. -static bool -isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, AllocaSlices &S, - uint64_t SliceBeginOffset, uint64_t SliceEndOffset, - AllocaSlices::const_iterator I, - AllocaSlices::const_iterator E, - ArrayRef<AllocaSlices::iterator> SplitUses) { - VectorType *Ty = dyn_cast<VectorType>(AllocaTy); - if (!Ty) - return false; +static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P, + const DataLayout &DL) { + // Collect the candidate types for vector-based promotion. Also track whether + // we have different element types. + SmallVector<VectorType *, 4> CandidateTys; + Type *CommonEltTy = nullptr; + bool HaveCommonEltTy = true; + auto CheckCandidateType = [&](Type *Ty) { + if (auto *VTy = dyn_cast<VectorType>(Ty)) { + CandidateTys.push_back(VTy); + if (!CommonEltTy) + CommonEltTy = VTy->getElementType(); + else if (CommonEltTy != VTy->getElementType()) + HaveCommonEltTy = false; + } + }; + // Consider any loads or stores that are the exact size of the slice. + for (const Slice &S : P) + if (S.beginOffset() == P.beginOffset() && + S.endOffset() == P.endOffset()) { + if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser())) + CheckCandidateType(LI->getType()); + else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) + CheckCandidateType(SI->getValueOperand()->getType()); + } - uint64_t ElementSize = DL.getTypeSizeInBits(Ty->getScalarType()); + // If we didn't find a vector type, nothing to do here. + if (CandidateTys.empty()) + return nullptr; - // While the definition of LLVM vectors is bitpacked, we don't support sizes - // that aren't byte sized. 
- if (ElementSize % 8) - return false; - assert((DL.getTypeSizeInBits(Ty) % 8) == 0 && - "vector size not a multiple of element size?"); - ElementSize /= 8; + // Remove non-integer vector types if we had multiple common element types. + // FIXME: It'd be nice to replace them with integer vector types, but we can't + // do that until all the backends are known to produce good code for all + // integer vector types. + if (!HaveCommonEltTy) { + CandidateTys.erase(std::remove_if(CandidateTys.begin(), CandidateTys.end(), + [](VectorType *VTy) { + return !VTy->getElementType()->isIntegerTy(); + }), + CandidateTys.end()); + + // If there were no integer vector types, give up. + if (CandidateTys.empty()) + return nullptr; - for (; I != E; ++I) - if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset, - SliceEndOffset, Ty, ElementSize, I)) - return false; + // Rank the remaining candidate vector types. This is easy because we know + // they're all integer vectors. We sort by ascending number of elements. + auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) { + assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) && + "Cannot have vector types of different sizes!"); + assert(RHSTy->getElementType()->isIntegerTy() && + "All non-integer types eliminated!"); + assert(LHSTy->getElementType()->isIntegerTy() && + "All non-integer types eliminated!"); + return RHSTy->getNumElements() < LHSTy->getNumElements(); + }; + std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes); + CandidateTys.erase( + std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes), + CandidateTys.end()); + } else { +// The only way to have the same element type in every vector type is to +// have the same vector type. Check that and remove all but one. +#ifndef NDEBUG + for (VectorType *VTy : CandidateTys) { + assert(VTy->getElementType() == CommonEltTy && + "Unaccounted for element type!"); + assert(VTy == CandidateTys[0] && + "Different vector types with the same element type!"); + } +#endif + CandidateTys.resize(1); + } - for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), - SUE = SplitUses.end(); - SUI != SUE; ++SUI) - if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset, - SliceEndOffset, Ty, ElementSize, *SUI)) + // Try each vector type, and return the one which works. + auto CheckVectorTypeForPromotion = [&](VectorType *VTy) { + uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType()); + + // While the definition of LLVM vectors is bitpacked, we don't support sizes + // that aren't byte sized. + if (ElementSize % 8) return false; + assert((DL.getTypeSizeInBits(VTy) % 8) == 0 && + "vector size not a multiple of element size?"); + ElementSize /= 8; - return true; + for (const Slice &S : P) + if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL)) + return false; + + for (const Slice *S : P.splitSliceTails()) + if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL)) + return false; + + return true; + }; + for (VectorType *VTy : CandidateTys) + if (CheckVectorTypeForPromotion(VTy)) + return VTy; + + return nullptr; } /// \brief Test whether a slice of an alloca is valid for integer widening. /// /// This implements the necessary checking for the \c isIntegerWideningViable /// test below on a single slice of the alloca. 
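The vector-promotion check rewritten above now reports its decision by returning the chosen vector type (or null) rather than a bool, so the caller no longer has to rediscover which type to use. A sketch of the resulting call shape, where P, SliceTy and DL stand in for a real partition, its common type and the DataLayout:

    if (VectorType *VecTy = isVectorPromotionViable(P, DL)) {
      // Rewrite the slices of P as element/subvector operations on VecTy.
    } else if (isIntegerWideningViable(P, SliceTy, DL)) {
      // Otherwise fall back to widening the partition to a single integer.
    }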
-static bool isIntegerWideningViableForSlice(const DataLayout &DL, - Type *AllocaTy, +static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t AllocBeginOffset, - uint64_t Size, AllocaSlices &S, - AllocaSlices::const_iterator I, + Type *AllocaTy, + const DataLayout &DL, bool &WholeAllocaOp) { - uint64_t RelBegin = I->beginOffset() - AllocBeginOffset; - uint64_t RelEnd = I->endOffset() - AllocBeginOffset; + uint64_t Size = DL.getTypeStoreSize(AllocaTy); + + uint64_t RelBegin = S.beginOffset() - AllocBeginOffset; + uint64_t RelEnd = S.endOffset() - AllocBeginOffset; // We can't reasonably handle cases where the load or store extends past // the end of the aloca's type and into its padding. if (RelEnd > Size) return false; - Use *U = I->getUse(); + Use *U = S.getUse(); if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { if (LI->isVolatile()) return false; - if (RelBegin == 0 && RelEnd == Size) + // Note that we don't count vector loads or stores as whole-alloca + // operations which enable integer widening because we would prefer to use + // vector widening instead. + if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size) WholeAllocaOp = true; if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) @@ -1768,7 +2145,10 @@ static bool isIntegerWideningViableForSlice(const DataLayout &DL, Type *ValueTy = SI->getValueOperand()->getType(); if (SI->isVolatile()) return false; - if (RelBegin == 0 && RelEnd == Size) + // Note that we don't count vector loads or stores as whole-alloca + // operations which enable integer widening because we would prefer to use + // vector widening instead. + if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size) WholeAllocaOp = true; if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) @@ -1782,7 +2162,7 @@ static bool isIntegerWideningViableForSlice(const DataLayout &DL, } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { if (MI->isVolatile() || !isa<Constant>(MI->getLength())) return false; - if (!I->isSplittable()) + if (!S.isSplittable()) return false; // Skip any unsplittable intrinsics. } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { if (II->getIntrinsicID() != Intrinsic::lifetime_start && @@ -1801,12 +2181,8 @@ static bool isIntegerWideningViableForSlice(const DataLayout &DL, /// This is a quick test to check whether we can rewrite the integer loads and /// stores to a particular alloca into wider loads and stores and be able to /// promote the resulting alloca. -static bool -isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy, - uint64_t AllocBeginOffset, AllocaSlices &S, - AllocaSlices::const_iterator I, - AllocaSlices::const_iterator E, - ArrayRef<AllocaSlices::iterator> SplitUses) { +static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy, + const DataLayout &DL) { uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); // Don't create integer types larger than the maximum bitwidth. if (SizeInBits > IntegerType::MAX_INT_BITS) @@ -1824,25 +2200,24 @@ isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy, !canConvertValue(DL, IntTy, AllocaTy)) return false; - uint64_t Size = DL.getTypeStoreSize(AllocaTy); - // While examining uses, we ensure that the alloca has a covering load or // store. 
We don't want to widen the integer operations only to fail to // promote due to some other unsplittable entry (which we may make splittable // later). However, if there are only splittable uses, go ahead and assume // that we cover the alloca. - bool WholeAllocaOp = (I != E) ? false : DL.isLegalInteger(SizeInBits); - - for (; I != E; ++I) - if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, - S, I, WholeAllocaOp)) + // FIXME: We shouldn't consider split slices that happen to start in the + // partition here... + bool WholeAllocaOp = + P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits); + + for (const Slice &S : P) + if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL, + WholeAllocaOp)) return false; - for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), - SUE = SplitUses.end(); - SUI != SUE; ++SUI) - if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, - S, *SUI, WholeAllocaOp)) + for (const Slice *S : P.splitSliceTails()) + if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL, + WholeAllocaOp)) return false; return WholeAllocaOp; @@ -1855,9 +2230,9 @@ static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, IntegerType *IntTy = cast<IntegerType>(V->getType()); assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && "Element extends past full value"); - uint64_t ShAmt = 8*Offset; + uint64_t ShAmt = 8 * Offset; if (DL.isBigEndian()) - ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); if (ShAmt) { V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); DEBUG(dbgs() << " shifted: " << *V << "\n"); @@ -1884,9 +2259,9 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, } assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && "Element store outside of alloca store"); - uint64_t ShAmt = 8*Offset; + uint64_t ShAmt = 8 * Offset; if (DL.isBigEndian()) - ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); if (ShAmt) { V = IRB.CreateShl(V, ShAmt, Name + ".shift"); DEBUG(dbgs() << " shifted: " << *V << "\n"); @@ -1902,9 +2277,8 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, return V; } -static Value *extractVector(IRBuilderTy &IRB, Value *V, - unsigned BeginIndex, unsigned EndIndex, - const Twine &Name) { +static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, + unsigned EndIndex, const Twine &Name) { VectorType *VecTy = cast<VectorType>(V->getType()); unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); @@ -1919,13 +2293,12 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, return V; } - SmallVector<Constant*, 8> Mask; + SmallVector<Constant *, 8> Mask; Mask.reserve(NumElements); for (unsigned i = BeginIndex; i != EndIndex; ++i) Mask.push_back(IRB.getInt32(i)); V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), - ConstantVector::get(Mask), - Name + ".extract"); + ConstantVector::get(Mask), Name + ".extract"); DEBUG(dbgs() << " shuffle: " << *V << "\n"); return V; } @@ -1940,7 +2313,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, // Single element to insert. 
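extractInteger and insertInteger above place a narrow value inside the alloca-wide integer with a shift of 8 * Offset bits on little-endian targets and 8 * (StoreSize(container) - StoreSize(element) - Offset) bits on big-endian ones. A standalone check of that arithmetic, using an assumed i16 element at byte offset 2 of an i64 container:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t ContainerBytes = 8; // store size of the wide integer (i64)
      uint64_t ElementBytes = 2;   // store size of the accessed piece (i16)
      uint64_t Offset = 2;         // byte offset of the piece in the container

      uint64_t ShAmtLE = 8 * Offset;                                   // 16
      uint64_t ShAmtBE = 8 * (ContainerBytes - ElementBytes - Offset); // 32
      std::printf("LE shift: %llu bits, BE shift: %llu bits\n",
                  (unsigned long long)ShAmtLE, (unsigned long long)ShAmtBE);
      return 0;
    }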
V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex), Name + ".insert"); - DEBUG(dbgs() << " insert: " << *V << "\n"); + DEBUG(dbgs() << " insert: " << *V << "\n"); return V; } @@ -1956,7 +2329,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, // use a shuffle vector to widen it with undef elements, and then // a second shuffle vector to select between the loaded vector and the // incoming vector. - SmallVector<Constant*, 8> Mask; + SmallVector<Constant *, 8> Mask; Mask.reserve(VecTy->getNumElements()); for (unsigned i = 0; i != VecTy->getNumElements(); ++i) if (i >= BeginIndex && i < EndIndex) @@ -1964,8 +2337,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, else Mask.push_back(UndefValue::get(IRB.getInt32Ty())); V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), - ConstantVector::get(Mask), - Name + ".expand"); + ConstantVector::get(Mask), Name + ".expand"); DEBUG(dbgs() << " shuffle: " << *V << "\n"); Mask.clear(); @@ -1991,12 +2363,18 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base; const DataLayout &DL; - AllocaSlices &S; + AllocaSlices &AS; SROA &Pass; AllocaInst &OldAI, &NewAI; const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; Type *NewAllocaTy; + // This is a convenience and flag variable that will be null unless the new + // alloca's integer operations should be widened to this integer type due to + // passing isIntegerWideningViable above. If it is non-null, the desired + // integer type will be stored here for easy access during rewriting. + IntegerType *IntTy; + // If we are rewriting an alloca partition which can be written as pure // vector operations, we stash extra information here. When VecTy is // non-null, we have some strict guarantees about the rewritten alloca: @@ -2010,12 +2388,6 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { Type *ElementTy; uint64_t ElementSize; - // This is a convenience and flag variable that will be null unless the new - // alloca's integer operations should be widened to this integer type due to - // passing isIntegerWideningViable above. If it is non-null, the desired - // integer type will be stored here for easy access during rewriting. - IntegerType *IntTy; - // The original offset of the slice currently being rewritten relative to // the original alloca. uint64_t BeginOffset, EndOffset; @@ -2038,25 +2410,25 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { IRBuilderTy IRB; public: - AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass, + AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, uint64_t NewAllocaBeginOffset, - uint64_t NewAllocaEndOffset, bool IsVectorPromotable, - bool IsIntegerPromotable, + uint64_t NewAllocaEndOffset, bool IsIntegerPromotable, + VectorType *PromotableVecTy, SmallPtrSetImpl<PHINode *> &PHIUsers, SmallPtrSetImpl<SelectInst *> &SelectUsers) - : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI), + : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI), NewAllocaBeginOffset(NewAllocaBeginOffset), NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAI.getAllocatedType()), - VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : nullptr), - ElementTy(VecTy ? VecTy->getElementType() : nullptr), - ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), IntTy(IsIntegerPromotable ? 
Type::getIntNTy( NewAI.getContext(), DL.getTypeSizeInBits(NewAI.getAllocatedType())) : nullptr), + VecTy(PromotableVecTy), + ElementTy(VecTy ? VecTy->getElementType() : nullptr), + ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), IRB(NewAI.getContext(), ConstantFolder()) { @@ -2065,8 +2437,7 @@ public: "Only multiple-of-8 sized vector elements are viable"); ++NumVectorized; } - assert((!IsVectorPromotable && !IsIntegerPromotable) || - IsVectorPromotable != IsIntegerPromotable); + assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy)); } bool visit(AllocaSlices::const_iterator I) { @@ -2076,6 +2447,9 @@ public: IsSplittable = I->isSplittable(); IsSplit = BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; + DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : "")); + DEBUG(AS.printSlice(dbgs(), I, "")); + DEBUG(dbgs() << "\n"); // Compute the intersecting offset range. assert(BeginOffset < NewAllocaEndOffset); @@ -2146,7 +2520,8 @@ private: ); } - /// \brief Compute suitable alignment to access this slice of the *new* alloca. + /// \brief Compute suitable alignment to access this slice of the *new* + /// alloca. /// /// You can optionally pass a type to this routine and if that type's ABI /// alignment is itself suitable, this will return zero. @@ -2154,7 +2529,8 @@ private: unsigned NewAIAlign = NewAI.getAlignment(); if (!NewAIAlign) NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); - unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); + unsigned Align = + MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align; } @@ -2178,16 +2554,14 @@ private: unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); - Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); return extractVector(IRB, V, BeginIndex, EndIndex, "vec"); } Value *rewriteIntegerLoad(LoadInst &LI) { assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); - Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); V = convertValue(DL, IRB, V, IntTy); assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; @@ -2212,8 +2586,8 @@ private: V = rewriteIntegerLoad(LI); } else if (NewBeginOffset == NewAllocaBeginOffset && canConvertValue(DL, NewAllocaTy, LI.getType())) { - V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - LI.isVolatile(), LI.getName()); + V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), + LI.getName()); } else { Type *LTy = TargetTy->getPointerTo(); V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), @@ -2230,7 +2604,7 @@ private: assert(SliceSize < DL.getTypeStoreSize(LI.getType()) && "Split load isn't smaller than original load"); assert(LI.getType()->getIntegerBitWidth() == - DL.getTypeStoreSizeInBits(LI.getType()) && + DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI))); @@ -2238,9 +2612,9 @@ private: // basis for the new value. 
This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving // LI only used for this computation. - Value *Placeholder - = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); - V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, + Value *Placeholder = + new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); + V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset, "insert"); LI.replaceAllUsesWith(V); Placeholder->replaceAllUsesWith(&LI); @@ -2262,15 +2636,14 @@ private: assert(EndIndex > BeginIndex && "Empty vector!"); unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); - Type *SliceTy = - (NumElements == 1) ? ElementTy - : VectorType::get(ElementTy, NumElements); + Type *SliceTy = (NumElements == 1) + ? ElementTy + : VectorType::get(ElementTy, NumElements); if (V->getType() != SliceTy) V = convertValue(DL, IRB, V, SliceTy); // Mix in the existing elements. - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); V = insertVector(IRB, Old, V, BeginIndex, "vec"); } StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); @@ -2285,13 +2658,12 @@ private: assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Old = convertValue(DL, IRB, Old, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, - "insert"); + V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert"); } V = convertValue(DL, IRB, V, NewAllocaTy); StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); @@ -2319,10 +2691,10 @@ private: assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); assert(V->getType()->getIntegerBitWidth() == - DL.getTypeStoreSizeInBits(V->getType()) && + DL.getTypeStoreSizeInBits(V->getType()) && "Non-byte-multiple bit width"); IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8); - V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, + V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset, "extract"); } @@ -2367,14 +2739,14 @@ private: if (Size == 1) return V; - Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); - V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"), - ConstantExpr::getUDiv( - Constant::getAllOnesValue(SplatIntTy), - ConstantExpr::getZExt( - Constant::getAllOnesValue(V->getType()), - SplatIntTy)), - "isplat"); + Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8); + V = IRB.CreateMul( + IRB.CreateZExt(V, SplatIntTy, "zext"), + ConstantExpr::getUDiv( + Constant::getAllOnesValue(SplatIntTy), + ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()), + SplatIntTy)), + "isplat"); return V; } @@ -2411,11 +2783,11 @@ private: // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memset. 
if (!VecTy && !IntTy && - (BeginOffset > NewAllocaBeginOffset || - EndOffset < NewAllocaEndOffset || + (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset || + SliceSize != DL.getTypeStoreSize(AllocaTy) || !AllocaTy->isSingleValueType() || !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) || - DL.getTypeSizeInBits(ScalarTy)%8 != 0)) { + DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) { Type *SizeTy = II.getLength()->getType(); Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemSet( @@ -2449,8 +2821,8 @@ private: if (NumElements > 1) Splat = getVectorSplat(Splat, NumElements); - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); V = insertVector(IRB, Old, Splat, BeginIndex, "vec"); } else if (IntTy) { // If this is a memset on an alloca where we can widen stores, insert the @@ -2462,8 +2834,8 @@ private: if (IntTy && (BeginOffset != NewAllocaBeginOffset || EndOffset != NewAllocaBeginOffset)) { - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Old = convertValue(DL, IRB, Old, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; V = insertInteger(DL, IRB, Old, V, Offset, "insert"); @@ -2535,10 +2907,11 @@ private: // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memcpy. - bool EmitMemCpy - = !VecTy && !IntTy && (BeginOffset > NewAllocaBeginOffset || - EndOffset < NewAllocaEndOffset || - !NewAI.getAllocatedType()->isSingleValueType()); + bool EmitMemCpy = + !VecTy && !IntTy && + (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset || + SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) || + !NewAI.getAllocatedType()->isSingleValueType()); // If we're just going to emit a memcpy, the alloca hasn't changed, and the // size hasn't been shrunk based on analysis of the viable range, this is @@ -2559,8 +2932,8 @@ private: // Strip all inbounds GEPs and pointer casts to try to dig out any root // alloca that should be re-examined after rewriting this instruction. Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); - if (AllocaInst *AI - = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) { + if (AllocaInst *AI = + dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) { assert(AI != &OldAI && AI != &NewAI && "Splittable transfers cannot reach the same alloca on both ends."); Pass.Worklist.insert(AI); @@ -2599,8 +2972,8 @@ private: unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0; unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; unsigned NumElements = EndIndex - BeginIndex; - IntegerType *SubIntTy - = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : nullptr; + IntegerType *SubIntTy = + IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr; // Reset the other pointer type to match the register type we're going to // use, but using the address space of the original other pointer. 
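As an aside on the "isplat" multiply rewritten a few hunks above: the multiplier is the all-ones value of the wide integer type udiv'd by the zero-extended all-ones value of the narrow type, which copies the narrow value into every position of the wide one. The standalone sketch below is illustration only, not part of the patch; the helper name and the fixed i8-to-i32 widths are assumptions made just for the example.

#include <cstdint>

// Replicate one byte across a 32-bit integer the same way the "isplat"
// multiply does: 0xFFFFFFFF / 0xFF == 0x01010101, and multiplying by that
// constant places the byte in every byte position, e.g. 0xAB -> 0xABABABAB.
static uint32_t splatByteExample(uint8_t V) {
  const uint32_t Multiplier = UINT32_MAX / 0xFFu; // 0x01010101
  return static_cast<uint32_t>(V) * Multiplier;
}

The IR version builds the same constant with ConstantExpr::getUDiv over Constant::getAllOnesValue, so it works for any legal integer width rather than the fixed widths assumed here.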
@@ -2629,27 +3002,25 @@ private: Value *Src; if (VecTy && !IsWholeAlloca && !IsDest) { - Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec"); } else if (IntTy && !IsWholeAlloca && !IsDest) { - Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); Src = convertValue(DL, IRB, Src, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); } else { - Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), - "copyload"); + Src = + IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload"); } if (VecTy && !IsWholeAlloca && IsDest) { - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Src = insertVector(IRB, Old, Src, BeginIndex, "vec"); } else if (IntTy && !IsWholeAlloca && IsDest) { - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "oldload"); + Value *Old = + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Old = convertValue(DL, IRB, Old, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; Src = insertInteger(DL, IRB, Old, Src, Offset, "insert"); @@ -2672,8 +3043,8 @@ private: // Record this instruction for deletion. Pass.DeadInsts.insert(&II); - ConstantInt *Size - = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), + ConstantInt *Size = + ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), NewEndOffset - NewBeginOffset); Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); Value *New; @@ -2740,7 +3111,6 @@ private: SelectUsers.insert(&SI); return true; } - }; } @@ -2787,7 +3157,7 @@ private: /// This uses a set to de-duplicate users. void enqueueUsers(Instruction &I) { for (Use &U : I.uses()) - if (Visited.insert(U.getUser())) + if (Visited.insert(U.getUser()).second) Queue.push_back(&U); } @@ -2795,8 +3165,7 @@ private: bool visitInstruction(Instruction &I) { return false; } /// \brief Generic recursive split emission class. - template <typename Derived> - class OpSplitter { + template <typename Derived> class OpSplitter { protected: /// The builder used to form new instructions. IRBuilderTy IRB; @@ -2813,7 +3182,7 @@ private: /// Initialize the splitter with an insertion point, Ptr and start with a /// single zero GEP index. OpSplitter(Instruction *InsertionPoint, Value *Ptr) - : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} + : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} public: /// \brief Generic recursive split emission routine. @@ -2869,7 +3238,7 @@ private: struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) - : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} + : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} /// Emit a leaf load of a single value. This is called at the leaves of the /// recursive emission to actually load values. @@ -2900,7 +3269,7 @@ private: struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) - : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} + : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} /// Emit a leaf store of a single value. 
This is called at the leaves of the /// recursive emission to actually produce stores. @@ -2908,8 +3277,8 @@ private: assert(Ty->isSingleValueType()); // Extract the single value and store it using the indices. Value *Store = IRB.CreateStore( - IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), - IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); + IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), + IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); (void)Store; DEBUG(dbgs() << " to: " << *Store << "\n"); } @@ -2995,8 +3364,8 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { /// when the size or offset cause either end of type-based partition to be off. /// Also, this is a best-effort routine. It is reasonable to give up and not /// return a type if necessary. -static Type *getTypePartition(const DataLayout &DL, Type *Ty, - uint64_t Offset, uint64_t Size) { +static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, + uint64_t Size) { if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size) return stripAggregateTypeWrapping(DL, Ty); if (Offset > DL.getTypeAllocSize(Ty) || @@ -3088,8 +3457,8 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, } // Try to build up a sub-structure. - StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), - STy->isPacked()); + StructType *SubTy = + StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked()); const StructLayout *SubSL = DL.getStructLayout(SubTy); if (Size != SubSL->getSizeInBytes()) return nullptr; // The sub-struct doesn't have quite the size needed. @@ -3097,6 +3466,494 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, return SubTy; } +/// \brief Pre-split loads and stores to simplify rewriting. +/// +/// We want to break up the splittable load+store pairs as much as +/// possible. This is important to do as a preprocessing step, as once we +/// start rewriting the accesses to partitions of the alloca we lose the +/// necessary information to correctly split apart paired loads and stores +/// which both point into this alloca. The case to consider is something like +/// the following: +/// +/// %a = alloca [12 x i8] +/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0 +/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4 +/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8 +/// %iptr1 = bitcast i8* %gep1 to i64* +/// %iptr2 = bitcast i8* %gep2 to i64* +/// %fptr1 = bitcast i8* %gep1 to float* +/// %fptr2 = bitcast i8* %gep2 to float* +/// %fptr3 = bitcast i8* %gep3 to float* +/// store float 0.0, float* %fptr1 +/// store float 1.0, float* %fptr2 +/// %v = load i64* %iptr1 +/// store i64 %v, i64* %iptr2 +/// %f1 = load float* %fptr2 +/// %f2 = load float* %fptr3 +/// +/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and +/// promote everything so we recover the 2 SSA values that should have been +/// there all along. +/// +/// \returns true if any changes are made. +bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { + DEBUG(dbgs() << "Pre-splitting loads and stores\n"); + + // Track the loads and stores which are candidates for pre-splitting here, in + // the order they first appear during the partition scan. These give stable + // iteration order and a basis for tracking which loads and stores we + // actually split. 
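// (Illustration, not part of the patch: in the 12-byte example from the
// function comment above, the i64 load through %iptr1 covers bytes [0,8) and
// crosses the partition boundary at offset 4, so the split offsets recorded
// for it below come out as just {4}, relative to the slice's begin offset.
// The i64 store through %iptr2 covers [4,12) and records {4} as well, and
// those matching relative splits are what allow the pair to be pre-split.)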
+ SmallVector<LoadInst *, 4> Loads; + SmallVector<StoreInst *, 4> Stores; + + // We need to accumulate the splits required of each load or store where we + // can find them via a direct lookup. This is important to cross-check loads + // and stores against each other. We also track the slice so that we can kill + // all the slices that end up split. + struct SplitOffsets { + Slice *S; + std::vector<uint64_t> Splits; + }; + SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap; + + // Track loads out of this alloca which cannot, for any reason, be pre-split. + // This is important as we also cannot pre-split stores of those loads! + // FIXME: This is all pretty gross. It means that we can be more aggressive + // in pre-splitting when the load feeding the store happens to come from + // a separate alloca. Put another way, the effectiveness of SROA would be + // decreased by a frontend which just concatenated all of its local allocas + // into one big flat alloca. But defeating such patterns is exactly the job + // SROA is tasked with! Sadly, to not have this discrepancy we would have + // change store pre-splitting to actually force pre-splitting of the load + // that feeds it *and all stores*. That makes pre-splitting much harder, but + // maybe it would make it more principled? + SmallPtrSet<LoadInst *, 8> UnsplittableLoads; + + DEBUG(dbgs() << " Searching for candidate loads and stores\n"); + for (auto &P : AS.partitions()) { + for (Slice &S : P) { + Instruction *I = cast<Instruction>(S.getUse()->getUser()); + if (!S.isSplittable() ||S.endOffset() <= P.endOffset()) { + // If this was a load we have to track that it can't participate in any + // pre-splitting! + if (auto *LI = dyn_cast<LoadInst>(I)) + UnsplittableLoads.insert(LI); + continue; + } + assert(P.endOffset() > S.beginOffset() && + "Empty or backwards partition!"); + + // Determine if this is a pre-splittable slice. + if (auto *LI = dyn_cast<LoadInst>(I)) { + assert(!LI->isVolatile() && "Cannot split volatile loads!"); + + // The load must be used exclusively to store into other pointers for + // us to be able to arbitrarily pre-split it. The stores must also be + // simple to avoid changing semantics. + auto IsLoadSimplyStored = [](LoadInst *LI) { + for (User *LU : LI->users()) { + auto *SI = dyn_cast<StoreInst>(LU); + if (!SI || !SI->isSimple()) + return false; + } + return true; + }; + if (!IsLoadSimplyStored(LI)) { + UnsplittableLoads.insert(LI); + continue; + } + + Loads.push_back(LI); + } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) { + if (!SI || + S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex())) + continue; + auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand()); + if (!StoredLoad || !StoredLoad->isSimple()) + continue; + assert(!SI->isVolatile() && "Cannot split volatile stores!"); + + Stores.push_back(SI); + } else { + // Other uses cannot be pre-split. + continue; + } + + // Record the initial split. + DEBUG(dbgs() << " Candidate: " << *I << "\n"); + auto &Offsets = SplitOffsetsMap[I]; + assert(Offsets.Splits.empty() && + "Should not have splits the first time we see an instruction!"); + Offsets.S = &S; + Offsets.Splits.push_back(P.endOffset() - S.beginOffset()); + } + + // Now scan the already split slices, and add a split for any of them which + // we're going to pre-split. 
+ for (Slice *S : P.splitSliceTails()) { + auto SplitOffsetsMapI = + SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser())); + if (SplitOffsetsMapI == SplitOffsetsMap.end()) + continue; + auto &Offsets = SplitOffsetsMapI->second; + + assert(Offsets.S == S && "Found a mismatched slice!"); + assert(!Offsets.Splits.empty() && + "Cannot have an empty set of splits on the second partition!"); + assert(Offsets.Splits.back() == + P.beginOffset() - Offsets.S->beginOffset() && + "Previous split does not end where this one begins!"); + + // Record each split. The last partition's end isn't needed as the size + // of the slice dictates that. + if (S->endOffset() > P.endOffset()) + Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset()); + } + } + + // We may have split loads where some of their stores are split stores. For + // such loads and stores, we can only pre-split them if their splits exactly + // match relative to their starting offset. We have to verify this prior to + // any rewriting. + Stores.erase( + std::remove_if(Stores.begin(), Stores.end(), + [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) { + // Lookup the load we are storing in our map of split + // offsets. + auto *LI = cast<LoadInst>(SI->getValueOperand()); + // If it was completely unsplittable, then we're done, + // and this store can't be pre-split. + if (UnsplittableLoads.count(LI)) + return true; + + auto LoadOffsetsI = SplitOffsetsMap.find(LI); + if (LoadOffsetsI == SplitOffsetsMap.end()) + return false; // Unrelated loads are definitely safe. + auto &LoadOffsets = LoadOffsetsI->second; + + // Now lookup the store's offsets. + auto &StoreOffsets = SplitOffsetsMap[SI]; + + // If the relative offsets of each split in the load and + // store match exactly, then we can split them and we + // don't need to remove them here. + if (LoadOffsets.Splits == StoreOffsets.Splits) + return false; + + DEBUG(dbgs() + << " Mismatched splits for load and store:\n" + << " " << *LI << "\n" + << " " << *SI << "\n"); + + // We've found a store and load that we need to split + // with mismatched relative splits. Just give up on them + // and remove both instructions from our list of + // candidates. + UnsplittableLoads.insert(LI); + return true; + }), + Stores.end()); + // Now we have to go *back* through all te stores, because a later store may + // have caused an earlier store's load to become unsplittable and if it is + // unsplittable for the later store, then we can't rely on it being split in + // the earlier store either. + Stores.erase(std::remove_if(Stores.begin(), Stores.end(), + [&UnsplittableLoads](StoreInst *SI) { + auto *LI = + cast<LoadInst>(SI->getValueOperand()); + return UnsplittableLoads.count(LI); + }), + Stores.end()); + // Once we've established all the loads that can't be split for some reason, + // filter any that made it into our list out. + Loads.erase(std::remove_if(Loads.begin(), Loads.end(), + [&UnsplittableLoads](LoadInst *LI) { + return UnsplittableLoads.count(LI); + }), + Loads.end()); + + + // If no loads or stores are left, there is no pre-splitting to be done for + // this alloca. + if (Loads.empty() && Stores.empty()) + return false; + + // From here on, we can't fail and will be building new accesses, so rig up + // an IR builder. + IRBuilderTy IRB(&AI); + + // Collect the new slices which we will merge into the alloca slices. + SmallVector<Slice, 4> NewSlices; + + // Track any allocas we end up splitting loads and stores for so we iterate + // on them. 
+ SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas; + + // At this point, we have collected all of the loads and stores we can + // pre-split, and the specific splits needed for them. We actually do the + // splitting in a specific order in order to handle when one of the loads in + // the value operand to one of the stores. + // + // First, we rewrite all of the split loads, and just accumulate each split + // load in a parallel structure. We also build the slices for them and append + // them to the alloca slices. + SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap; + std::vector<LoadInst *> SplitLoads; + for (LoadInst *LI : Loads) { + SplitLoads.clear(); + + IntegerType *Ty = cast<IntegerType>(LI->getType()); + uint64_t LoadSize = Ty->getBitWidth() / 8; + assert(LoadSize > 0 && "Cannot have a zero-sized integer load!"); + + auto &Offsets = SplitOffsetsMap[LI]; + assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() && + "Slice size should always match load size exactly!"); + uint64_t BaseOffset = Offsets.S->beginOffset(); + assert(BaseOffset + LoadSize > BaseOffset && + "Cannot represent alloca access size using 64-bit integers!"); + + Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand()); + IRB.SetInsertPoint(BasicBlock::iterator(LI)); + + DEBUG(dbgs() << " Splitting load: " << *LI << "\n"); + + uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); + int Idx = 0, Size = Offsets.Splits.size(); + for (;;) { + auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); + auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace()); + LoadInst *PLoad = IRB.CreateAlignedLoad( + getAdjustedPtr(IRB, *DL, BasePtr, + APInt(DL->getPointerSizeInBits(), PartOffset), + PartPtrTy, BasePtr->getName() + "."), + getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false, + LI->getName()); + + // Append this load onto the list of split loads so we can find it later + // to rewrite the stores. + SplitLoads.push_back(PLoad); + + // Now build a new slice for the alloca. + NewSlices.push_back( + Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, + &PLoad->getOperandUse(PLoad->getPointerOperandIndex()), + /*IsSplittable*/ false)); + DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() + << ", " << NewSlices.back().endOffset() << "): " << *PLoad + << "\n"); + + // See if we've handled all the splits. + if (Idx >= Size) + break; + + // Setup the next partition. + PartOffset = Offsets.Splits[Idx]; + ++Idx; + PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset; + } + + // Now that we have the split loads, do the slow walk over all uses of the + // load and rewrite them as split stores, or save the split loads to use + // below if the store is going to be split there anyways. + bool DeferredStores = false; + for (User *LU : LI->users()) { + StoreInst *SI = cast<StoreInst>(LU); + if (!Stores.empty() && SplitOffsetsMap.count(SI)) { + DeferredStores = true; + DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n"); + continue; + } + + Value *StoreBasePtr = SI->getPointerOperand(); + IRB.SetInsertPoint(BasicBlock::iterator(SI)); + + DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); + + for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) { + LoadInst *PLoad = SplitLoads[Idx]; + uint64_t PartOffset = Idx == 0 ? 
0 : Offsets.Splits[Idx - 1]; + auto *PartPtrTy = + PLoad->getType()->getPointerTo(SI->getPointerAddressSpace()); + + StoreInst *PStore = IRB.CreateAlignedStore( + PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr, + APInt(DL->getPointerSizeInBits(), PartOffset), + PartPtrTy, StoreBasePtr->getName() + "."), + getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false); + (void)PStore; + DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); + } + + // We want to immediately iterate on any allocas impacted by splitting + // this store, and we have to track any promotable alloca (indicated by + // a direct store) as needing to be resplit because it is no longer + // promotable. + if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) { + ResplitPromotableAllocas.insert(OtherAI); + Worklist.insert(OtherAI); + } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>( + StoreBasePtr->stripInBoundsOffsets())) { + Worklist.insert(OtherAI); + } + + // Mark the original store as dead. + DeadInsts.insert(SI); + } + + // Save the split loads if there are deferred stores among the users. + if (DeferredStores) + SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads))); + + // Mark the original load as dead and kill the original slice. + DeadInsts.insert(LI); + Offsets.S->kill(); + } + + // Second, we rewrite all of the split stores. At this point, we know that + // all loads from this alloca have been split already. For stores of such + // loads, we can simply look up the pre-existing split loads. For stores of + // other loads, we split those loads first and then write split stores of + // them. + for (StoreInst *SI : Stores) { + auto *LI = cast<LoadInst>(SI->getValueOperand()); + IntegerType *Ty = cast<IntegerType>(LI->getType()); + uint64_t StoreSize = Ty->getBitWidth() / 8; + assert(StoreSize > 0 && "Cannot have a zero-sized integer store!"); + + auto &Offsets = SplitOffsetsMap[SI]; + assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() && + "Slice size should always match load size exactly!"); + uint64_t BaseOffset = Offsets.S->beginOffset(); + assert(BaseOffset + StoreSize > BaseOffset && + "Cannot represent alloca access size using 64-bit integers!"); + + Value *LoadBasePtr = LI->getPointerOperand(); + Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand()); + + DEBUG(dbgs() << " Splitting store: " << *SI << "\n"); + + // Check whether we have an already split load. + auto SplitLoadsMapI = SplitLoadsMap.find(LI); + std::vector<LoadInst *> *SplitLoads = nullptr; + if (SplitLoadsMapI != SplitLoadsMap.end()) { + SplitLoads = &SplitLoadsMapI->second; + assert(SplitLoads->size() == Offsets.Splits.size() + 1 && + "Too few split loads for the number of splits in the store!"); + } else { + DEBUG(dbgs() << " of load: " << *LI << "\n"); + } + + uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); + int Idx = 0, Size = Offsets.Splits.size(); + for (;;) { + auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); + auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace()); + + // Either lookup a split load or create one. + LoadInst *PLoad; + if (SplitLoads) { + PLoad = (*SplitLoads)[Idx]; + } else { + IRB.SetInsertPoint(BasicBlock::iterator(LI)); + PLoad = IRB.CreateAlignedLoad( + getAdjustedPtr(IRB, *DL, LoadBasePtr, + APInt(DL->getPointerSizeInBits(), PartOffset), + PartPtrTy, LoadBasePtr->getName() + "."), + getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false, + LI->getName()); + } + + // And store this partition. 
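// (Aside, illustration only: with Offsets.Splits == {4, 8} for a
// hypothetical 16-byte value, the enclosing loop visits (PartOffset,
// PartSize) = (0, 4), (4, 4) and finally (8, 8); the size of the last part
// falls out of the total size rather than a recorded split point.)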
+ IRB.SetInsertPoint(BasicBlock::iterator(SI)); + StoreInst *PStore = IRB.CreateAlignedStore( + PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr, + APInt(DL->getPointerSizeInBits(), PartOffset), + PartPtrTy, StoreBasePtr->getName() + "."), + getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false); + + // Now build a new slice for the alloca. + NewSlices.push_back( + Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, + &PStore->getOperandUse(PStore->getPointerOperandIndex()), + /*IsSplittable*/ false)); + DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset() + << ", " << NewSlices.back().endOffset() << "): " << *PStore + << "\n"); + if (!SplitLoads) { + DEBUG(dbgs() << "      of split load: " << *PLoad << "\n"); + } + + // See if we've finished all the splits. + if (Idx >= Size) + break; + + // Setup the next partition. + PartOffset = Offsets.Splits[Idx]; + ++Idx; + PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset; + } + + // We want to immediately iterate on any allocas impacted by splitting + // this load, which is only relevant if it isn't a load of this alloca and + // thus we didn't already split the loads above. We also have to keep track + // of any promotable allocas we split loads on as they can no longer be + // promoted. + if (!SplitLoads) { + if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) { + assert(OtherAI != &AI && "We can't re-split our own alloca!"); + ResplitPromotableAllocas.insert(OtherAI); + Worklist.insert(OtherAI); + } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>( + LoadBasePtr->stripInBoundsOffsets())) { + assert(OtherAI != &AI && "We can't re-split our own alloca!"); + Worklist.insert(OtherAI); + } + } + + // Mark the original store as dead now that we've split it up and kill its + // slice. Note that we leave the original load in place unless this store + // was its only use. It may in turn be split up if it is an alloca load + // for some other alloca, but it may be a normal load. This may introduce + // redundant loads, but where those can be merged the rest of the optimizer + // should handle the merging, and this uncovers SSA splits which is more + // important. In practice, the original loads will almost always be fully + // split and removed eventually, and the splits will be merged by any + // trivial CSE, including instcombine. + if (LI->hasOneUse()) { + assert(*LI->user_begin() == SI && "Single use isn't this store!"); + DeadInsts.insert(LI); + } + DeadInsts.insert(SI); + Offsets.S->kill(); + } + + // Remove the killed slices that have been pre-split. + AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) { + return S.isDead(); + }), AS.end()); + + // Insert our new slices. This will sort and merge them into the sorted + // sequence. + AS.insert(NewSlices); + + DEBUG(dbgs() << "  Pre-split slices:\n"); +#ifndef NDEBUG + for (auto I = AS.begin(), E = AS.end(); I != E; ++I) + DEBUG(AS.print(dbgs(), I, "    ")); +#endif + + // Finally, don't try to promote any allocas that now require re-splitting. + // They have already been added to the worklist above. + PromotableAllocas.erase( + std::remove_if( + PromotableAllocas.begin(), PromotableAllocas.end(), + [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }), + PromotableAllocas.end()); + + return true; +} + /// \brief Rewrite an alloca partition's users. /// /// This routine drives both of the rewriting goals of the SROA pass.
It tries @@ -3107,38 +3964,33 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, /// appropriate new offsets. It also evaluates how successful the rewrite was /// at enabling promotion and if it was successful queues the alloca to be /// promoted. -bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, - AllocaSlices::iterator B, AllocaSlices::iterator E, - int64_t BeginOffset, int64_t EndOffset, - ArrayRef<AllocaSlices::iterator> SplitUses) { - assert(BeginOffset < EndOffset); - uint64_t SliceSize = EndOffset - BeginOffset; - +bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, + AllocaSlices::Partition &P) { // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. Type *SliceTy = nullptr; - if (Type *CommonUseTy = findCommonType(B, E, EndOffset)) - if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize) + if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset())) + if (DL->getTypeAllocSize(CommonUseTy) >= P.size()) SliceTy = CommonUseTy; if (!SliceTy) if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(), - BeginOffset, SliceSize)) + P.beginOffset(), P.size())) SliceTy = TypePartitionTy; if ((!SliceTy || (SliceTy->isArrayTy() && SliceTy->getArrayElementType()->isIntegerTy())) && - DL->isLegalInteger(SliceSize * 8)) - SliceTy = Type::getIntNTy(*C, SliceSize * 8); + DL->isLegalInteger(P.size() * 8)) + SliceTy = Type::getIntNTy(*C, P.size() * 8); if (!SliceTy) - SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize); - assert(DL->getTypeAllocSize(SliceTy) >= SliceSize); + SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size()); + assert(DL->getTypeAllocSize(SliceTy) >= P.size()); - bool IsVectorPromotable = isVectorPromotionViable( - *DL, SliceTy, S, BeginOffset, EndOffset, B, E, SplitUses); + bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, *DL); - bool IsIntegerPromotable = - !IsVectorPromotable && - isIntegerWideningViable(*DL, SliceTy, BeginOffset, S, B, E, SplitUses); + VectorType *VecTy = + IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, *DL); + if (VecTy) + SliceTy = VecTy; // Check for the case where we're going to rewrite to a new alloca of the // exact same type as the original, and with the same access offsets. In that @@ -3146,7 +3998,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, // perform phi and select speculation. AllocaInst *NewAI; if (SliceTy == AI.getAllocatedType()) { - assert(BeginOffset == 0 && + assert(P.beginOffset() == 0 && "Non-zero begin offset but same alloca type"); NewAI = &AI; // FIXME: We should be able to bail at this point with "nothing changed". @@ -3159,19 +4011,20 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, // type. Alignment = DL->getABITypeAlignment(AI.getAllocatedType()); } - Alignment = MinAlign(Alignment, BeginOffset); + Alignment = MinAlign(Alignment, P.beginOffset()); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. if (Alignment <= DL->getABITypeAlignment(SliceTy)) Alignment = 0; - NewAI = new AllocaInst(SliceTy, nullptr, Alignment, - AI.getName() + ".sroa." + Twine(B - S.begin()), &AI); + NewAI = new AllocaInst( + SliceTy, nullptr, Alignment, + AI.getName() + ".sroa." 
+ Twine(P.begin() - AS.begin()), &AI); ++NumNewAllocas; } DEBUG(dbgs() << "Rewriting alloca partition " - << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI - << "\n"); + << "[" << P.beginOffset() << "," << P.endOffset() + << ") to: " << *NewAI << "\n"); // Track the high watermark on the worklist as it is only relevant for // promoted allocas. We will reset it to this point if the alloca is not in @@ -3181,22 +4034,16 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, SmallPtrSet<PHINode *, 8> PHIUsers; SmallPtrSet<SelectInst *, 8> SelectUsers; - AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset, - EndOffset, IsVectorPromotable, - IsIntegerPromotable, PHIUsers, SelectUsers); + AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, P.beginOffset(), + P.endOffset(), IsIntegerPromotable, VecTy, + PHIUsers, SelectUsers); bool Promotable = true; - for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), - SUE = SplitUses.end(); - SUI != SUE; ++SUI) { - DEBUG(dbgs() << " rewriting split "); - DEBUG(S.printSlice(dbgs(), *SUI, "")); - Promotable &= Rewriter.visit(*SUI); + for (Slice *S : P.splitSliceTails()) { + Promotable &= Rewriter.visit(S); ++NumUses; } - for (AllocaSlices::iterator I = B; I != E; ++I) { - DEBUG(dbgs() << " rewriting "); - DEBUG(S.printSlice(dbgs(), I, "")); - Promotable &= Rewriter.visit(I); + for (Slice &S : P) { + Promotable &= Rewriter.visit(&S); ++NumUses; } @@ -3233,14 +4080,10 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, // If we have either PHIs or Selects to speculate, add them to those // worklists and re-queue the new alloca so that we promote in on the // next iteration. - for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(), - E = PHIUsers.end(); - I != E; ++I) - SpeculatablePHIs.insert(*I); - for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(), - E = SelectUsers.end(); - I != E; ++I) - SpeculatableSelects.insert(*I); + for (PHINode *PHIUser : PHIUsers) + SpeculatablePHIs.insert(PHIUser); + for (SelectInst *SelectUser : SelectUsers) + SpeculatableSelects.insert(SelectUser); Worklist.insert(NewAI); } } else { @@ -3258,136 +4101,46 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, return true; } -static void -removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses, - uint64_t &MaxSplitUseEndOffset, uint64_t Offset) { - if (Offset >= MaxSplitUseEndOffset) { - SplitUses.clear(); - MaxSplitUseEndOffset = 0; - return; - } - - size_t SplitUsesOldSize = SplitUses.size(); - SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(), - [Offset](const AllocaSlices::iterator &I) { - return I->endOffset() <= Offset; - }), - SplitUses.end()); - if (SplitUsesOldSize == SplitUses.size()) - return; - - // Recompute the max. While this is linear, so is remove_if. - MaxSplitUseEndOffset = 0; - for (SmallVectorImpl<AllocaSlices::iterator>::iterator - SUI = SplitUses.begin(), - SUE = SplitUses.end(); - SUI != SUE; ++SUI) - MaxSplitUseEndOffset = std::max((*SUI)->endOffset(), MaxSplitUseEndOffset); -} - /// \brief Walks the slices of an alloca and form partitions based on them, /// rewriting each of their uses. 
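// (Illustration, not part of the patch: for the 12-byte alloca in the
// pre-splitting example above, this walk visits three 4-byte partitions,
// [0,4), [4,8) and [8,12), and hands each one to rewritePartition, which
// rebuilds it as its own "<name>.sroa.N" alloca and, when the rewrite proves
// promotable, queues it for promotion.)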
-bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) { - if (S.begin() == S.end()) +bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { + if (AS.begin() == AS.end()) return false; unsigned NumPartitions = 0; bool Changed = false; - SmallVector<AllocaSlices::iterator, 4> SplitUses; - uint64_t MaxSplitUseEndOffset = 0; - - uint64_t BeginOffset = S.begin()->beginOffset(); - - for (AllocaSlices::iterator SI = S.begin(), SJ = std::next(SI), SE = S.end(); - SI != SE; SI = SJ) { - uint64_t MaxEndOffset = SI->endOffset(); - - if (!SI->isSplittable()) { - // When we're forming an unsplittable region, it must always start at the - // first slice and will extend through its end. - assert(BeginOffset == SI->beginOffset()); - - // Form a partition including all of the overlapping slices with this - // unsplittable slice. - while (SJ != SE && SJ->beginOffset() < MaxEndOffset) { - if (!SJ->isSplittable()) - MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset()); - ++SJ; - } - } else { - assert(SI->isSplittable()); // Established above. - // Collect all of the overlapping splittable slices. - while (SJ != SE && SJ->beginOffset() < MaxEndOffset && - SJ->isSplittable()) { - MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset()); - ++SJ; - } + // First try to pre-split loads and stores. + Changed |= presplitLoadsAndStores(AI, AS); - // Back up MaxEndOffset and SJ if we ended the span early when - // encountering an unsplittable slice. - if (SJ != SE && SJ->beginOffset() < MaxEndOffset) { - assert(!SJ->isSplittable()); - MaxEndOffset = SJ->beginOffset(); - } - } - - // Check if we have managed to move the end offset forward yet. If so, - // we'll have to rewrite uses and erase old split uses. - if (BeginOffset < MaxEndOffset) { - // Rewrite a sequence of overlapping slices. - Changed |= - rewritePartition(AI, S, SI, SJ, BeginOffset, MaxEndOffset, SplitUses); - ++NumPartitions; - - removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset); - } - - // Accumulate all the splittable slices from the [SI,SJ) region which - // overlap going forward. - for (AllocaSlices::iterator SK = SI; SK != SJ; ++SK) - if (SK->isSplittable() && SK->endOffset() > MaxEndOffset) { - SplitUses.push_back(SK); - MaxSplitUseEndOffset = std::max(SK->endOffset(), MaxSplitUseEndOffset); - } - - // If we're already at the end and we have no split uses, we're done. - if (SJ == SE && SplitUses.empty()) - break; - - // If we have no split uses or no gap in offsets, we're ready to move to - // the next slice. - if (SplitUses.empty() || (SJ != SE && MaxEndOffset == SJ->beginOffset())) { - BeginOffset = SJ->beginOffset(); + // Now that we have identified any pre-splitting opportunities, mark any + // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail + // to split these during pre-splitting, we want to force them to be + // rewritten into a partition. + bool IsSorted = true; + for (Slice &S : AS) { + if (!S.isSplittable()) continue; - } - - // Even if we have split slices, if the next slice is splittable and the - // split slices reach it, we can simply set up the beginning offset of the - // next iteration to bridge between them. - if (SJ != SE && SJ->isSplittable() && - MaxSplitUseEndOffset > SJ->beginOffset()) { - BeginOffset = MaxEndOffset; + // FIXME: We currently leave whole-alloca splittable loads and stores. 
This + // used to be the only splittable loads and stores and we need to be + // confident that the above handling of splittable loads and stores is + // completely sufficient before we forcibly disable the remaining handling. + if (S.beginOffset() == 0 && + S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType())) continue; + if (isa<LoadInst>(S.getUse()->getUser()) || + isa<StoreInst>(S.getUse()->getUser())) { + S.makeUnsplittable(); + IsSorted = false; } + } + if (!IsSorted) + std::sort(AS.begin(), AS.end()); - // Otherwise, we have a tail of split slices. Rewrite them with an empty - // range of slices. - uint64_t PostSplitEndOffset = - SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset(); - - Changed |= rewritePartition(AI, S, SJ, SJ, MaxEndOffset, PostSplitEndOffset, - SplitUses); + // Rewrite each partition. + for (auto &P : AS.partitions()) { + Changed |= rewritePartition(AI, AS, P); ++NumPartitions; - - if (SJ == SE) - break; // Skip the rest, we don't need to do any cleanup. - - removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, - PostSplitEndOffset); - - // Now just reset the begin offset for the next iteration. - BeginOffset = SJ->beginOffset(); } NumAllocaPartitions += NumPartitions; @@ -3440,38 +4193,34 @@ bool SROA::runOnAlloca(AllocaInst &AI) { Changed |= AggRewriter.rewrite(AI); // Build the slices using a recursive instruction-visiting builder. - AllocaSlices S(*DL, AI); - DEBUG(S.print(dbgs())); - if (S.isEscaped()) + AllocaSlices AS(*DL, AI); + DEBUG(AS.print(dbgs())); + if (AS.isEscaped()) return Changed; // Delete all the dead users of this alloca before splitting and rewriting it. - for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(), - DE = S.dead_user_end(); - DI != DE; ++DI) { + for (Instruction *DeadUser : AS.getDeadUsers()) { // Free up everything used by this instruction. - for (Use &DeadOp : (*DI)->operands()) + for (Use &DeadOp : DeadUser->operands()) clobberUse(DeadOp); // Now replace the uses of this instruction. - (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); + DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType())); // And mark it for deletion. - DeadInsts.insert(*DI); + DeadInsts.insert(DeadUser); Changed = true; } - for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(), - DE = S.dead_op_end(); - DO != DE; ++DO) { - clobberUse(**DO); + for (Use *DeadOp : AS.getDeadOperands()) { + clobberUse(*DeadOp); Changed = true; } // No slices to split. Leave the dead alloca for a later pass to clean up. - if (S.begin() == S.end()) + if (AS.begin() == AS.end()) return Changed; - Changed |= splitAlloca(AI, S); + Changed |= splitAlloca(AI, AS); DEBUG(dbgs() << " Speculating PHIs\n"); while (!SpeculatablePHIs.empty()) @@ -3493,7 +4242,8 @@ bool SROA::runOnAlloca(AllocaInst &AI) { /// /// We also record the alloca instructions deleted here so that they aren't /// subsequently handed to mem2reg to promote. 
-void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { +void SROA::deleteDeadInstructions( + SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) { while (!DeadInsts.empty()) { Instruction *I = DeadInsts.pop_back_val(); DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); @@ -3518,9 +4268,9 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { static void enqueueUsersInWorklist(Instruction &I, SmallVectorImpl<Instruction *> &Worklist, - SmallPtrSet<Instruction *, 8> &Visited) { + SmallPtrSetImpl<Instruction *> &Visited) { for (User *U : I.users()) - if (Visited.insert(cast<Instruction>(U))) + if (Visited.insert(cast<Instruction>(U)).second) Worklist.push_back(cast<Instruction>(U)); } @@ -3540,14 +4290,14 @@ bool SROA::promoteAllocas(Function &F) { if (DT && !ForceSSAUpdater) { DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, *DT); + PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); PromotableAllocas.clear(); return true; } DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); SSAUpdater SSA; - DIBuilder DIB(*F.getParent()); + DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); SmallVector<Instruction *, 64> Insts; // We need a worklist to walk the uses of each alloca. @@ -3622,6 +4372,7 @@ bool SROA::runOnFunction(Function &F) { DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); @@ -3642,9 +4393,7 @@ bool SROA::runOnFunction(Function &F) { // Remove the deleted allocas from various lists so that we don't try to // continue processing them. 
if (!DeletedAllocas.empty()) { - auto IsInSet = [&](AllocaInst *AI) { - return DeletedAllocas.count(AI); - }; + auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); }; Worklist.remove_if(IsInSet); PostPromotionWorklist.remove_if(IsInSet); PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(), @@ -3665,6 +4414,7 @@ bool SROA::runOnFunction(Function &F) { } void SROA::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AssumptionCacheTracker>(); if (RequiresDomTree) AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp index 73c97ffeef4f..179bbf78366d 100644 --- a/lib/Transforms/Scalar/SampleProfile.cpp +++ b/lib/Transforms/Scalar/SampleProfile.cpp @@ -26,7 +26,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/PostDominators.h" @@ -42,15 +41,14 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/LineIterator.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" #include <cctype> using namespace llvm; +using namespace sampleprof; #define DEBUG_TYPE "sample-profile" @@ -65,76 +63,48 @@ static cl::opt<unsigned> SampleProfileMaxPropagateIterations( "sample block/edge weights through the CFG.")); namespace { -/// \brief Represents the relative location of an instruction. -/// -/// Instruction locations are specified by the line offset from the -/// beginning of the function (marked by the line where the function -/// header is) and the discriminator value within that line. -/// -/// The discriminator value is useful to distinguish instructions -/// that are on the same line but belong to different basic blocks -/// (e.g., the two post-increment instructions in "if (p) x++; else y++;"). 
-struct InstructionLocation { - InstructionLocation(int L, unsigned D) : LineOffset(L), Discriminator(D) {} - int LineOffset; - unsigned Discriminator; -}; -} - -namespace llvm { -template <> struct DenseMapInfo<InstructionLocation> { - typedef DenseMapInfo<int> OffsetInfo; - typedef DenseMapInfo<unsigned> DiscriminatorInfo; - static inline InstructionLocation getEmptyKey() { - return InstructionLocation(OffsetInfo::getEmptyKey(), - DiscriminatorInfo::getEmptyKey()); - } - static inline InstructionLocation getTombstoneKey() { - return InstructionLocation(OffsetInfo::getTombstoneKey(), - DiscriminatorInfo::getTombstoneKey()); - } - static inline unsigned getHashValue(InstructionLocation Val) { - return DenseMapInfo<std::pair<int, unsigned>>::getHashValue( - std::pair<int, unsigned>(Val.LineOffset, Val.Discriminator)); - } - static inline bool isEqual(InstructionLocation LHS, InstructionLocation RHS) { - return LHS.LineOffset == RHS.LineOffset && - LHS.Discriminator == RHS.Discriminator; - } -}; -} - -namespace { -typedef DenseMap<InstructionLocation, unsigned> BodySampleMap; typedef DenseMap<BasicBlock *, unsigned> BlockWeightMap; typedef DenseMap<BasicBlock *, BasicBlock *> EquivalenceClassMap; typedef std::pair<BasicBlock *, BasicBlock *> Edge; typedef DenseMap<Edge, unsigned> EdgeWeightMap; typedef DenseMap<BasicBlock *, SmallVector<BasicBlock *, 8>> BlockEdgeMap; -/// \brief Representation of the runtime profile for a function. +/// \brief Sample profile pass. /// -/// This data structure contains the runtime profile for a given -/// function. It contains the total number of samples collected -/// in the function and a map of samples collected in every statement. -class SampleFunctionProfile { +/// This pass reads profile data from the file specified by +/// -sample-profile-file and annotates every affected function with the +/// profile information found in that file. 
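/// (Usage sketch, not part of this patch: assuming the loader keeps its
/// usual "sample-profile" pass name, it can be exercised from opt roughly as
///   opt -sample-profile -sample-profile-file=foo.prof -S input.ll -o out.ll
/// where foo.prof stands for a hypothetical profile in a format the
/// SampleProfileReader understands.)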
+class SampleProfileLoader : public FunctionPass { public: - SampleFunctionProfile() - : TotalSamples(0), TotalHeadSamples(0), HeaderLineno(0), DT(nullptr), - PDT(nullptr), LI(nullptr), Ctx(nullptr) {} + // Class identification, replacement for typeinfo + static char ID; + + SampleProfileLoader(StringRef Name = SampleProfileFile) + : FunctionPass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Ctx(nullptr), + Reader(), Samples(nullptr), Filename(Name), ProfileIsValid(false) { + initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry()); + } + + bool doInitialization(Module &M) override; + + void dump() { Reader->dump(); } + + const char *getPassName() const override { return "Sample profile pass"; } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<LoopInfo>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTree>(); + } +protected: unsigned getFunctionLoc(Function &F); - bool emitAnnotations(Function &F, DominatorTree *DomTree, - PostDominatorTree *PostDomTree, LoopInfo *Loops); + bool emitAnnotations(Function &F); unsigned getInstWeight(Instruction &I); - unsigned getBlockWeight(BasicBlock *B); - void addTotalSamples(unsigned Num) { TotalSamples += Num; } - void addHeadSamples(unsigned Num) { TotalHeadSamples += Num; } - void addBodySamples(int LineOffset, unsigned Discriminator, unsigned Num) { - assert(LineOffset >= 0); - BodySamples[InstructionLocation(LineOffset, Discriminator)] += Num; - } - void print(raw_ostream &OS); + unsigned getBlockWeight(BasicBlock *BB); void printEdgeWeight(raw_ostream &OS, Edge E); void printBlockWeight(raw_ostream &OS, BasicBlock *BB); void printBlockEquivalence(raw_ostream &OS, BasicBlock *BB); @@ -147,32 +117,11 @@ public: unsigned visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(Function &F); bool propagateThroughEdges(Function &F); - bool empty() { return BodySamples.empty(); } -protected: - /// \brief Total number of samples collected inside this function. - /// - /// Samples are cumulative, they include all the samples collected - /// inside this function and all its inlined callees. - unsigned TotalSamples; - - /// \brief Total number of samples collected at the head of the function. - /// FIXME: Use head samples to estimate a cold/hot attribute for the function. - unsigned TotalHeadSamples; - - /// \brief Line number for the function header. Used to compute relative - /// line numbers from the absolute line LOCs found in instruction locations. - /// The relative line numbers are needed to address the samples from the - /// profile file. + /// \brief Line number for the function header. Used to compute absolute + /// line numbers from the relative line numbers found in the profile. unsigned HeaderLineno; - /// \brief Map line offsets to collected samples. - /// - /// Each entry in this map contains the number of samples - /// collected at the corresponding line offset. All line locations - /// are an offset from the start of the function. - BodySampleMap BodySamples; - /// \brief Map basic blocks to their computed weights. /// /// The weight of a basic block is defined to be the maximum @@ -212,105 +161,12 @@ protected: /// \brief LLVM context holding the debug data we need. LLVMContext *Ctx; -}; - -/// \brief Sample-based profile reader. -/// -/// Each profile contains sample counts for all the functions -/// executed. 
Inside each function, statements are annotated with the -/// collected samples on all the instructions associated with that -/// statement. -/// -/// For this to produce meaningful data, the program needs to be -/// compiled with some debug information (at minimum, line numbers: -/// -gline-tables-only). Otherwise, it will be impossible to match IR -/// instructions to the line numbers collected by the profiler. -/// -/// From the profile file, we are interested in collecting the -/// following information: -/// -/// * A list of functions included in the profile (mangled names). -/// -/// * For each function F: -/// 1. The total number of samples collected in F. -/// -/// 2. The samples collected at each line in F. To provide some -/// protection against source code shuffling, line numbers should -/// be relative to the start of the function. -class SampleModuleProfile { -public: - SampleModuleProfile(const Module &M, StringRef F) - : Profiles(0), Filename(F), M(M) {} - - void dump(); - bool loadText(); - void loadNative() { llvm_unreachable("not implemented"); } - void printFunctionProfile(raw_ostream &OS, StringRef FName); - void dumpFunctionProfile(StringRef FName); - SampleFunctionProfile &getProfile(const Function &F) { - return Profiles[F.getName()]; - } - /// \brief Report a parse error message. - void reportParseError(int64_t LineNumber, Twine Msg) const { - DiagnosticInfoSampleProfile Diag(Filename.data(), LineNumber, Msg); - M.getContext().diagnose(Diag); - } - -protected: - /// \brief Map every function to its associated profile. - /// - /// The profile of every function executed at runtime is collected - /// in the structure SampleFunctionProfile. This maps function objects - /// to their corresponding profiles. - StringMap<SampleFunctionProfile> Profiles; - - /// \brief Path name to the file holding the profile data. - /// - /// The format of this file is defined by each profiler - /// independently. If possible, the profiler should have a text - /// version of the profile format to be used in constructing test - /// cases and debugging. - StringRef Filename; - - /// \brief Module being compiled. Used mainly to access the current - /// LLVM context for diagnostics. - const Module &M; -}; - -/// \brief Sample profile pass. -/// -/// This pass reads profile data from the file specified by -/// -sample-profile-file and annotates every affected function with the -/// profile information found in that file. -class SampleProfileLoader : public FunctionPass { -public: - // Class identification, replacement for typeinfo - static char ID; - - SampleProfileLoader(StringRef Name = SampleProfileFile) - : FunctionPass(ID), Profiler(), Filename(Name), ProfileIsValid(false) { - initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry()); - } - - bool doInitialization(Module &M) override; - - void dump() { Profiler->dump(); } - - const char *getPassName() const override { return "Sample profile pass"; } - - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<LoopInfo>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTree>(); - } - -protected: /// \brief Profile reader object. - std::unique_ptr<SampleModuleProfile> Profiler; + std::unique_ptr<SampleProfileReader> Reader; + + /// \brief Samples collected for the body of this function. + FunctionSamples *Samples; /// \brief Name of the profile file to load. 
StringRef Filename; @@ -320,26 +176,11 @@ protected: }; } -/// \brief Print this function profile on stream \p OS. -/// -/// \param OS Stream to emit the output to. -void SampleFunctionProfile::print(raw_ostream &OS) { - OS << TotalSamples << ", " << TotalHeadSamples << ", " << BodySamples.size() - << " sampled lines\n"; - for (BodySampleMap::const_iterator SI = BodySamples.begin(), - SE = BodySamples.end(); - SI != SE; ++SI) - OS << "\tline offset: " << SI->first.LineOffset - << ", discriminator: " << SI->first.Discriminator - << ", number of samples: " << SI->second << "\n"; - OS << "\n"; -} - /// \brief Print the weight of edge \p E on stream \p OS. /// /// \param OS Stream to emit the output to. /// \param E Edge to print. -void SampleFunctionProfile::printEdgeWeight(raw_ostream &OS, Edge E) { +void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) { OS << "weight[" << E.first->getName() << "->" << E.second->getName() << "]: " << EdgeWeights[E] << "\n"; } @@ -348,8 +189,8 @@ void SampleFunctionProfile::printEdgeWeight(raw_ostream &OS, Edge E) { /// /// \param OS Stream to emit the output to. /// \param BB Block to print. -void SampleFunctionProfile::printBlockEquivalence(raw_ostream &OS, - BasicBlock *BB) { +void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS, + BasicBlock *BB) { BasicBlock *Equiv = EquivalenceClass[BB]; OS << "equivalence[" << BB->getName() << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n"; @@ -359,174 +200,10 @@ void SampleFunctionProfile::printBlockEquivalence(raw_ostream &OS, /// /// \param OS Stream to emit the output to. /// \param BB Block to print. -void SampleFunctionProfile::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { +void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { OS << "weight[" << BB->getName() << "]: " << BlockWeights[BB] << "\n"; } -/// \brief Print the function profile for \p FName on stream \p OS. -/// -/// \param OS Stream to emit the output to. -/// \param FName Name of the function to print. -void SampleModuleProfile::printFunctionProfile(raw_ostream &OS, - StringRef FName) { - OS << "Function: " << FName << ":\n"; - Profiles[FName].print(OS); -} - -/// \brief Dump the function profile for \p FName. -/// -/// \param FName Name of the function to print. -void SampleModuleProfile::dumpFunctionProfile(StringRef FName) { - printFunctionProfile(dbgs(), FName); -} - -/// \brief Dump all the function profiles found. -void SampleModuleProfile::dump() { - for (StringMap<SampleFunctionProfile>::const_iterator I = Profiles.begin(), - E = Profiles.end(); - I != E; ++I) - dumpFunctionProfile(I->getKey()); -} - -/// \brief Load samples from a text file. -/// -/// The file contains a list of samples for every function executed at -/// runtime. Each function profile has the following format: -/// -/// function1:total_samples:total_head_samples -/// offset1[.discriminator]: number_of_samples [fn1:num fn2:num ... ] -/// offset2[.discriminator]: number_of_samples [fn3:num fn4:num ... ] -/// ... -/// offsetN[.discriminator]: number_of_samples [fn5:num fn6:num ... ] -/// -/// Function names must be mangled in order for the profile loader to -/// match them in the current translation unit. The two numbers in the -/// function header specify how many total samples were accumulated in -/// the function (first number), and the total number of samples accumulated -/// at the prologue of the function (second number). 
This head sample -/// count provides an indicator of how frequent is the function invoked. -/// -/// Each sampled line may contain several items. Some are optional -/// (marked below): -/// -/// a- Source line offset. This number represents the line number -/// in the function where the sample was collected. The line number -/// is always relative to the line where symbol of the function -/// is defined. So, if the function has its header at line 280, -/// the offset 13 is at line 293 in the file. -/// -/// b- [OPTIONAL] Discriminator. This is used if the sampled program -/// was compiled with DWARF discriminator support -/// (http://wiki.dwarfstd.org/index.php?title=Path_Discriminators) -/// -/// c- Number of samples. This is the number of samples collected by -/// the profiler at this source location. -/// -/// d- [OPTIONAL] Potential call targets and samples. If present, this -/// line contains a call instruction. This models both direct and -/// indirect calls. Each called target is listed together with the -/// number of samples. For example, -/// -/// 130: 7 foo:3 bar:2 baz:7 -/// -/// The above means that at relative line offset 130 there is a -/// call instruction that calls one of foo(), bar() and baz(). With -/// baz() being the relatively more frequent call target. -/// -/// FIXME: This is currently unhandled, but it has a lot of -/// potential for aiding the inliner. -/// -/// -/// Since this is a flat profile, a function that shows up more than -/// once gets all its samples aggregated across all its instances. -/// -/// FIXME: flat profiles are too imprecise to provide good optimization -/// opportunities. Convert them to context-sensitive profile. -/// -/// This textual representation is useful to generate unit tests and -/// for debugging purposes, but it should not be used to generate -/// profiles for large programs, as the representation is extremely -/// inefficient. -/// -/// \returns true if the file was loaded successfully, false otherwise. -bool SampleModuleProfile::loadText() { - ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = - MemoryBuffer::getFile(Filename); - if (std::error_code EC = BufferOrErr.getError()) { - std::string Msg(EC.message()); - M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg)); - return false; - } - std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get()); - line_iterator LineIt(*Buffer, '#'); - - // Read the profile of each function. Since each function may be - // mentioned more than once, and we are collecting flat profiles, - // accumulate samples as we parse them. - Regex HeadRE("^([^0-9].*):([0-9]+):([0-9]+)$"); - Regex LineSample("^([0-9]+)\\.?([0-9]+)?: ([0-9]+)(.*)$"); - while (!LineIt.is_at_eof()) { - // Read the header of each function. - // - // Note that for function identifiers we are actually expecting - // mangled names, but we may not always get them. This happens when - // the compiler decides not to emit the function (e.g., it was inlined - // and removed). In this case, the binary will not have the linkage - // name for the function, so the profiler will emit the function's - // unmangled name, which may contain characters like ':' and '>' in its - // name (member functions, templates, etc). - // - // The only requirement we place on the identifier, then, is that it - // should not begin with a number. 
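For reference, a standalone sketch of parsing one body line of the text format just described, using std::regex instead of LLVM's Regex class; this is illustrative only and the helper name is invented.

#include <cstdio>
#include <regex>
#include <string>

// Parses "offset[.discriminator]: num_samples [callee:count ...]".
bool parseBodyLine(const std::string &Line, unsigned &LineOffset,
                   unsigned &Discriminator, unsigned &NumSamples) {
  static const std::regex Re("^([0-9]+)\\.?([0-9]+)?: ([0-9]+)(.*)$");
  std::smatch M;
  if (!std::regex_match(Line, M, Re))
    return false;
  LineOffset = static_cast<unsigned>(std::stoul(M[1].str()));
  Discriminator = M[2].matched ? static_cast<unsigned>(std::stoul(M[2].str())) : 0;
  NumSamples = static_cast<unsigned>(std::stoul(M[3].str()));
  // M[4], when non-empty, holds the optional call-target list.
  return true;
}

int main() {
  unsigned Off, Disc, Samples;
  if (parseBodyLine("130: 7 foo:3 bar:2 baz:7", Off, Disc, Samples))
    std::printf("offset %u.%u -> %u samples\n", Off, Disc, Samples);
}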
- SmallVector<StringRef, 3> Matches; - if (!HeadRE.match(*LineIt, &Matches)) { - reportParseError(LineIt.line_number(), - "Expected 'mangled_name:NUM:NUM', found " + *LineIt); - return false; - } - assert(Matches.size() == 4); - StringRef FName = Matches[1]; - unsigned NumSamples, NumHeadSamples; - Matches[2].getAsInteger(10, NumSamples); - Matches[3].getAsInteger(10, NumHeadSamples); - Profiles[FName] = SampleFunctionProfile(); - SampleFunctionProfile &FProfile = Profiles[FName]; - FProfile.addTotalSamples(NumSamples); - FProfile.addHeadSamples(NumHeadSamples); - ++LineIt; - - // Now read the body. The body of the function ends when we reach - // EOF or when we see the start of the next function. - while (!LineIt.is_at_eof() && isdigit((*LineIt)[0])) { - if (!LineSample.match(*LineIt, &Matches)) { - reportParseError( - LineIt.line_number(), - "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + *LineIt); - return false; - } - assert(Matches.size() == 5); - unsigned LineOffset, NumSamples, Discriminator = 0; - Matches[1].getAsInteger(10, LineOffset); - if (Matches[2] != "") - Matches[2].getAsInteger(10, Discriminator); - Matches[3].getAsInteger(10, NumSamples); - - // FIXME: Handle called targets (in Matches[4]). - - // When dealing with instruction weights, we use the value - // zero to indicate the absence of a sample. If we read an - // actual zero from the profile file, return it as 1 to - // avoid the confusion later on. - if (NumSamples == 0) - NumSamples = 1; - FProfile.addBodySamples(LineOffset, Discriminator, NumSamples); - ++LineIt; - } - } - - return true; -} - /// \brief Get the weight for an instruction. /// /// The "weight" of an instruction \p Inst is the number of samples @@ -538,7 +215,7 @@ bool SampleModuleProfile::loadText() { /// \param Inst Instruction to query. /// /// \returns The profiled weight of I. -unsigned SampleFunctionProfile::getInstWeight(Instruction &Inst) { +unsigned SampleProfileLoader::getInstWeight(Instruction &Inst) { DebugLoc DLoc = Inst.getDebugLoc(); unsigned Lineno = DLoc.getLine(); if (Lineno < HeaderLineno) @@ -547,8 +224,7 @@ unsigned SampleFunctionProfile::getInstWeight(Instruction &Inst) { DILocation DIL(DLoc.getAsMDNode(*Ctx)); int LOffset = Lineno - HeaderLineno; unsigned Discriminator = DIL.getDiscriminator(); - unsigned Weight = - BodySamples.lookup(InstructionLocation(LOffset, Discriminator)); + unsigned Weight = Samples->samplesAt(LOffset, Discriminator); DEBUG(dbgs() << " " << Lineno << "." << Discriminator << ":" << Inst << " (line offset: " << LOffset << "." << Discriminator << " - weight: " << Weight << ")\n"); @@ -557,24 +233,24 @@ unsigned SampleFunctionProfile::getInstWeight(Instruction &Inst) { /// \brief Compute the weight of a basic block. /// -/// The weight of basic block \p B is the maximum weight of all the -/// instructions in B. The weight of \p B is computed and cached in +/// The weight of basic block \p BB is the maximum weight of all the +/// instructions in BB. The weight of \p BB is computed and cached in /// the BlockWeights map. /// -/// \param B The basic block to query. +/// \param BB The basic block to query. /// -/// \returns The computed weight of B. -unsigned SampleFunctionProfile::getBlockWeight(BasicBlock *B) { - // If we've computed B's weight before, return it. +/// \returns The computed weight of BB. +unsigned SampleProfileLoader::getBlockWeight(BasicBlock *BB) { + // If we've computed BB's weight before, return it. 
std::pair<BlockWeightMap::iterator, bool> Entry = - BlockWeights.insert(std::make_pair(B, 0)); + BlockWeights.insert(std::make_pair(BB, 0)); if (!Entry.second) return Entry.first->second; - // Otherwise, compute and cache B's weight. + // Otherwise, compute and cache BB's weight. unsigned Weight = 0; - for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) { - unsigned InstWeight = getInstWeight(*I); + for (auto &I : BB->getInstList()) { + unsigned InstWeight = getInstWeight(I); if (InstWeight > Weight) Weight = InstWeight; } @@ -588,13 +264,13 @@ unsigned SampleFunctionProfile::getBlockWeight(BasicBlock *B) { /// the weights of every basic block in the CFG. /// /// \param F The function to query. -bool SampleFunctionProfile::computeBlockWeights(Function &F) { +bool SampleProfileLoader::computeBlockWeights(Function &F) { bool Changed = false; DEBUG(dbgs() << "Block weights\n"); - for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) { - unsigned Weight = getBlockWeight(B); + for (auto &BB : F) { + unsigned Weight = getBlockWeight(&BB); Changed |= (Weight > 0); - DEBUG(printBlockWeight(dbgs(), B)); + DEBUG(printBlockWeight(dbgs(), &BB)); } return Changed; @@ -623,16 +299,13 @@ bool SampleFunctionProfile::computeBlockWeights(Function &F) { /// \param DomTree Opposite dominator tree. If \p Descendants is filled /// with blocks from \p BB1's dominator tree, then /// this is the post-dominator tree, and vice versa. -void SampleFunctionProfile::findEquivalencesFor( +void SampleProfileLoader::findEquivalencesFor( BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants, DominatorTreeBase<BasicBlock> *DomTree) { - for (SmallVectorImpl<BasicBlock *>::iterator I = Descendants.begin(), - E = Descendants.end(); - I != E; ++I) { - BasicBlock *BB2 = *I; + for (auto *BB2 : Descendants) { bool IsDomParent = DomTree->dominates(BB2, BB1); bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2); - if (BB1 != BB2 && VisitedBlocks.insert(BB2) && IsDomParent && + if (BB1 != BB2 && VisitedBlocks.insert(BB2).second && IsDomParent && IsInSameLoop) { EquivalenceClass[BB2] = BB1; @@ -660,12 +333,12 @@ void SampleFunctionProfile::findEquivalencesFor( /// dominates B2, B2 post-dominates B1 and both are in the same loop. /// /// \param F The function to query. -void SampleFunctionProfile::findEquivalenceClasses(Function &F) { +void SampleProfileLoader::findEquivalenceClasses(Function &F) { SmallVector<BasicBlock *, 8> DominatedBBs; DEBUG(dbgs() << "\nBlock equivalence classes\n"); // Find equivalence sets based on dominance and post-dominance information. - for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) { - BasicBlock *BB1 = B; + for (auto &BB : F) { + BasicBlock *BB1 = &BB; // Compute BB1's equivalence class once. if (EquivalenceClass.count(BB1)) { @@ -712,8 +385,8 @@ void SampleFunctionProfile::findEquivalenceClasses(Function &F) { // each equivalence class has the largest weight, assign that weight // to all the blocks in that equivalence class. DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n"); - for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) { - BasicBlock *BB = B; + for (auto &BI : F) { + BasicBlock *BB = &BI; BasicBlock *EquivBB = EquivalenceClass[BB]; if (BB != EquivBB) BlockWeights[BB] = BlockWeights[EquivBB]; @@ -731,8 +404,8 @@ void SampleFunctionProfile::findEquivalenceClasses(Function &F) { /// \param UnknownEdge Set if E has not been visited before. /// /// \returns E's weight, if known. Otherwise, return 0. 
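The edge counting done by visitEdge feeds a simple inference rule used during propagation: once a block's weight and all but one of its incident edges are known, the remaining edge receives the difference, clamped at zero. A standalone model of that rule (plain C++ with invented names, not the pass itself):

#include <cstdio>
#include <vector>

unsigned inferUnknownEdgeWeight(unsigned BlockWeight,
                                const std::vector<unsigned> &KnownEdgeWeights) {
  unsigned TotalKnown = 0;
  for (unsigned W : KnownEdgeWeights)
    TotalKnown += W;
  // If the known edges already account for more than the block's weight,
  // the unknown edge cannot carry any samples.
  return BlockWeight >= TotalKnown ? BlockWeight - TotalKnown : 0;
}

int main() {
  std::vector<unsigned> Known = {40, 25};
  std::printf("unknown edge weight: %u\n",
              inferUnknownEdgeWeight(100, Known)); // prints 35
}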
-unsigned SampleFunctionProfile::visitEdge(Edge E, unsigned *NumUnknownEdges, - Edge *UnknownEdge) { +unsigned SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, + Edge *UnknownEdge) { if (!VisitedEdges.count(E)) { (*NumUnknownEdges)++; *UnknownEdge = E; @@ -753,11 +426,11 @@ unsigned SampleFunctionProfile::visitEdge(Edge E, unsigned *NumUnknownEdges, /// \param F Function to process. /// /// \returns True if new weights were assigned to edges or blocks. -bool SampleFunctionProfile::propagateThroughEdges(Function &F) { +bool SampleProfileLoader::propagateThroughEdges(Function &F) { bool Changed = false; DEBUG(dbgs() << "\nPropagation through edges\n"); - for (Function::iterator BI = F.begin(), EI = F.end(); BI != EI; ++BI) { - BasicBlock *BB = BI; + for (auto &BI : F) { + BasicBlock *BB = &BI; // Visit all the predecessor and successor edges to determine // which ones have a weight assigned already. Note that it doesn't @@ -771,16 +444,16 @@ bool SampleFunctionProfile::propagateThroughEdges(Function &F) { if (i == 0) { // First, visit all predecessor edges. - for (size_t I = 0; I < Predecessors[BB].size(); I++) { - Edge E = std::make_pair(Predecessors[BB][I], BB); + for (auto *Pred : Predecessors[BB]) { + Edge E = std::make_pair(Pred, BB); TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); if (E.first == E.second) SelfReferentialEdge = E; } } else { // On the second round, visit all successor edges. - for (size_t I = 0; I < Successors[BB].size(); I++) { - Edge E = std::make_pair(BB, Successors[BB][I]); + for (auto *Succ : Successors[BB]) { + Edge E = std::make_pair(BB, Succ); TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); } } @@ -821,7 +494,7 @@ bool SampleFunctionProfile::propagateThroughEdges(Function &F) { << " known. Set weight for block: "; printBlockWeight(dbgs(), BB);); } - if (VisitedBlocks.insert(BB)) + if (VisitedBlocks.insert(BB).second) Changed = true; } else if (NumUnknownEdges == 1 && VisitedBlocks.count(BB)) { // If there is a single unknown edge and the block has been @@ -857,9 +530,9 @@ bool SampleFunctionProfile::propagateThroughEdges(Function &F) { /// /// We are interested in unique edges. If a block B1 has multiple /// edges to another block B2, we only add a single B1->B2 edge. -void SampleFunctionProfile::buildEdges(Function &F) { - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - BasicBlock *B1 = I; +void SampleProfileLoader::buildEdges(Function &F) { + for (auto &BI : F) { + BasicBlock *B1 = &BI; // Add predecessors for B1. 
SmallPtrSet<BasicBlock *, 16> Visited; @@ -867,7 +540,7 @@ void SampleFunctionProfile::buildEdges(Function &F) { llvm_unreachable("Found a stale predecessors list in a basic block."); for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) { BasicBlock *B2 = *PI; - if (Visited.insert(B2)) + if (Visited.insert(B2).second) Predecessors[B1].push_back(B2); } @@ -877,7 +550,7 @@ void SampleFunctionProfile::buildEdges(Function &F) { llvm_unreachable("Found a stale successors list in a basic block."); for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) { BasicBlock *B2 = *SI; - if (Visited.insert(B2)) + if (Visited.insert(B2).second) Successors[B1].push_back(B2); } } @@ -885,22 +558,22 @@ void SampleFunctionProfile::buildEdges(Function &F) { /// \brief Propagate weights into edges /// -/// The following rules are applied to every block B in the CFG: +/// The following rules are applied to every block BB in the CFG: /// -/// - If B has a single predecessor/successor, then the weight +/// - If BB has a single predecessor/successor, then the weight /// of that edge is the weight of the block. /// /// - If all incoming or outgoing edges are known except one, and the /// weight of the block is already known, the weight of the unknown /// edge will be the weight of the block minus the sum of all the known -/// edges. If the sum of all the known edges is larger than B's weight, +/// edges. If the sum of all the known edges is larger than BB's weight, /// we set the unknown edge weight to zero. /// /// - If there is a self-referential edge, and the weight of the block is /// known, the weight for that edge is set to the weight of the block /// minus the weight of the other incoming edges to that block (if /// known). -void SampleFunctionProfile::propagateWeights(Function &F) { +void SampleProfileLoader::propagateWeights(Function &F) { bool Changed = true; unsigned i = 0; @@ -920,9 +593,9 @@ void SampleFunctionProfile::propagateWeights(Function &F) { // edge weights computed during propagation. DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n"); MDBuilder MDB(F.getContext()); - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - BasicBlock *B = I; - TerminatorInst *TI = B->getTerminator(); + for (auto &BI : F) { + BasicBlock *BB = &BI; + TerminatorInst *TI = BB->getTerminator(); if (TI->getNumSuccessors() == 1) continue; if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) @@ -934,7 +607,7 @@ void SampleFunctionProfile::propagateWeights(Function &F) { bool AllWeightsZero = true; for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { BasicBlock *Succ = TI->getSuccessor(I); - Edge E = std::make_pair(B, Succ); + Edge E = std::make_pair(BB, Succ); unsigned Weight = EdgeWeights[E]; DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E)); Weights.push_back(Weight); @@ -965,22 +638,17 @@ void SampleFunctionProfile::propagateWeights(Function &F) { /// /// \returns the line number where \p F is defined. If it returns 0, /// it means that there is no debug information available for \p F. 
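Once propagation has settled, the weights are attached as !prof branch_weights metadata, as the propagateWeights body above shows. A sketch of that final step in isolation (not code from this commit; the helper name, the two-successor restriction, and the exact headers are assumptions):

#include <cstdint>
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

// Attach branch weights to a conditional branch, skipping the annotation when
// every weight is zero (mirroring the AllWeightsZero check above).
void annotateBranch(llvm::BranchInst *BI, uint32_t TrueWeight,
                    uint32_t FalseWeight) {
  if (TrueWeight == 0 && FalseWeight == 0)
    return;
  llvm::MDBuilder MDB(BI->getContext());
  llvm::SmallVector<uint32_t, 2> Weights;
  Weights.push_back(TrueWeight);
  Weights.push_back(FalseWeight);
  BI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
}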
-unsigned SampleFunctionProfile::getFunctionLoc(Function &F) { - NamedMDNode *CUNodes = F.getParent()->getNamedMetadata("llvm.dbg.cu"); - if (CUNodes) { - for (unsigned I = 0, E1 = CUNodes->getNumOperands(); I != E1; ++I) { - DICompileUnit CU(CUNodes->getOperand(I)); - DIArray Subprograms = CU.getSubprograms(); - for (unsigned J = 0, E2 = Subprograms.getNumElements(); J != E2; ++J) { - DISubprogram Subprogram(Subprograms.getElement(J)); - if (Subprogram.describes(&F)) - return Subprogram.getLineNumber(); - } - } - } +unsigned SampleProfileLoader::getFunctionLoc(Function &F) { + DISubprogram S = getDISubprogram(&F); + if (S.isSubprogram()) + return S.getLineNumber(); + // If could not find the start of \p F, emit a diagnostic to inform the user + // about the missed opportunity. F.getContext().diagnose(DiagnosticInfoSampleProfile( - "No debug information found in function " + F.getName())); + "No debug information found in function " + F.getName() + + ": Function profile not used", + DS_Warning)); return 0; } @@ -1002,15 +670,15 @@ unsigned SampleFunctionProfile::getFunctionLoc(Function &F) { /// /// 3- Propagation of block weights into edges. This uses a simple /// propagation heuristic. The following rules are applied to every -/// block B in the CFG: +/// block BB in the CFG: /// -/// - If B has a single predecessor/successor, then the weight +/// - If BB has a single predecessor/successor, then the weight /// of that edge is the weight of the block. /// /// - If all the edges are known except one, and the weight of the /// block is already known, the weight of the unknown edge will /// be the weight of the block minus the sum of all the known -/// edges. If the sum of all the known edges is larger than B's weight, +/// edges. If the sum of all the known edges is larger than BB's weight, /// we set the unknown edge weight to zero. /// /// - If there is a self-referential edge, and the weight of the block is @@ -1028,14 +696,12 @@ unsigned SampleFunctionProfile::getFunctionLoc(Function &F) { /// work here. /// /// Once all the branch weights are computed, we emit the MD_prof -/// metadata on B using the computed values for each of its branches. +/// metadata on BB using the computed values for each of its branches. /// /// \param F The function to query. /// /// \returns true if \p F was modified. Returns false, otherwise. -bool SampleFunctionProfile::emitAnnotations(Function &F, DominatorTree *DomTree, - PostDominatorTree *PostDomTree, - LoopInfo *Loops) { +bool SampleProfileLoader::emitAnnotations(Function &F) { bool Changed = false; // Initialize invariants used during computation and propagation. @@ -1045,10 +711,6 @@ bool SampleFunctionProfile::emitAnnotations(Function &F, DominatorTree *DomTree, DEBUG(dbgs() << "Line number for the first instruction in " << F.getName() << ": " << HeaderLineno << "\n"); - DT = DomTree; - PDT = PostDomTree; - LI = Loops; - Ctx = &F.getParent()->getContext(); // Compute basic block weights. 
Changed |= computeBlockWeights(F); @@ -1075,8 +737,14 @@ INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) bool SampleProfileLoader::doInitialization(Module &M) { - Profiler.reset(new SampleModuleProfile(M, Filename)); - ProfileIsValid = Profiler->loadText(); + auto ReaderOrErr = SampleProfileReader::create(Filename, M.getContext()); + if (std::error_code EC = ReaderOrErr.getError()) { + std::string Msg = "Could not open profile: " + EC.message(); + M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg)); + return false; + } + Reader = std::move(ReaderOrErr.get()); + ProfileIsValid = (Reader->read() == sampleprof_error::success); return true; } @@ -1091,11 +759,13 @@ FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) { bool SampleProfileLoader::runOnFunction(Function &F) { if (!ProfileIsValid) return false; - DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - PostDominatorTree *PDT = &getAnalysis<PostDominatorTree>(); - LoopInfo *LI = &getAnalysis<LoopInfo>(); - SampleFunctionProfile &FunctionProfile = Profiler->getProfile(F); - if (!FunctionProfile.empty()) - return FunctionProfile.emitAnnotations(F, DT, PDT, LI); + + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + PDT = &getAnalysis<PostDominatorTree>(); + LI = &getAnalysis<LoopInfo>(); + Ctx = &F.getParent()->getContext(); + Samples = Reader->getSamplesFor(F); + if (!Samples->empty()) + return emitAnnotations(F); return false; } diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index de724d419a48..a16e9e29a1f1 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -28,6 +28,7 @@ using namespace llvm; /// ScalarOpts library. 
void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); + initializeAlignmentFromAssumptionsPass(Registry); initializeSampleProfileLoaderPass(Registry); initializeConstantHoistingPass(Registry); initializeConstantPropagationPass(Registry); @@ -38,6 +39,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeDSEPass(Registry); initializeGVNPass(Registry); initializeEarlyCSEPass(Registry); + initializeFlattenCFGPassPass(Registry); initializeIndVarSimplifyPass(Registry); initializeJumpThreadingPass(Registry); initializeLICMPass(Registry); @@ -77,6 +79,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createAggressiveDCEPass()); } +void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createAlignmentFromAssumptionsPass()); +} + void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createCFGSimplificationPass()); } @@ -145,6 +151,10 @@ void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createPartiallyInlineLibCallsPass()); } +void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLowerSwitchPass()); +} + void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createPromoteMemoryToRegisterPass()); } @@ -203,6 +213,10 @@ void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createTypeBasedAliasAnalysisPass()); } +void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createScopedNoAliasAAPass()); +} + void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBasicAliasAnalysisPass()); } diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index e2a24a7fd4a7..5c49a5504b47 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" @@ -197,6 +198,7 @@ namespace { // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } @@ -214,6 +216,7 @@ namespace { // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. 
void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); AU.setPreservesCFG(); } }; @@ -225,12 +228,14 @@ char SROA_SSAUp::ID = 0; INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl", "Scalar Replacement of Aggregates (DT)", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(SROA_DT, "scalarrepl", "Scalar Replacement of Aggregates (DT)", false, false) INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa", "Scalar Replacement of Aggregates (SSAUp)", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa", "Scalar Replacement of Aggregates (SSAUp)", false, false) @@ -1063,12 +1068,14 @@ public: void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { // Remember which alloca we're promoting (for isInstInList). this->AI = AI; - if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) { - for (User *U : DebugNode->users()) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) - DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) - DVIs.push_back(DVI); + if (auto *L = LocalAsMetadata::getIfExists(AI)) { + if (auto *DebugNode = MetadataAsValue::getIfExists(AI->getContext(), L)) { + for (User *U : DebugNode->users()) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) + DDIs.push_back(DDI); + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) + DVIs.push_back(DVI); + } } LoadAndStorePromoter::run(Insts); @@ -1119,9 +1126,9 @@ public: } else { continue; } - Instruction *DbgVal = - DIB->insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), - Inst); + Instruction *DbgVal = DIB->insertDbgValueIntrinsic( + Arg, 0, DIVariable(DVI->getVariable()), + DIExpression(DVI->getExpression()), Inst); DbgVal->setDebugLoc(DVI->getDebugLoc()); } } @@ -1333,12 +1340,15 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { LoadInst *FalseLoad = Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f"); - // Transfer alignment and TBAA info if present. + // Transfer alignment and AA info if present. TrueLoad->setAlignment(LI->getAlignment()); FalseLoad->setAlignment(LI->getAlignment()); - if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { - TrueLoad->setMetadata(LLVMContext::MD_tbaa, Tag); - FalseLoad->setMetadata(LLVMContext::MD_tbaa, Tag); + + AAMDNodes Tags; + LI->getAAMetadata(Tags); + if (Tags) { + TrueLoad->setAAMetadata(Tags); + FalseLoad->setAAMetadata(Tags); } Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad); @@ -1364,10 +1374,12 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(), PN->getName()+".ld", PN); - // Get the TBAA tag and alignment to use from one of the loads. It doesn't + // Get the AA tags and alignment to use from one of the loads. It doesn't // matter which one we get and if any differ, it doesn't matter. LoadInst *SomeLoad = cast<LoadInst>(PN->user_back()); - MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); + + AAMDNodes AATags; + SomeLoad->getAAMetadata(AATags); unsigned Align = SomeLoad->getAlignment(); // Rewrite all loads of the PN to use the new PHI. @@ -1389,7 +1401,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { PN->getName() + "." 
+ Pred->getName(), Pred->getTerminator()); Load->setAlignment(Align); - if (TBAATag) Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); + if (AATags) Load->setAAMetadata(AATags); } NewPN->addIncoming(Load, Pred); @@ -1407,9 +1419,11 @@ bool SROA::performPromotion(Function &F) { DominatorTree *DT = nullptr; if (HasDomTree) DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + AssumptionCache &AC = + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function - DIBuilder DIB(*F.getParent()); + DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); bool Changed = false; SmallVector<Instruction*, 64> Insts; while (1) { @@ -1425,7 +1439,7 @@ bool SROA::performPromotion(Function &F) { if (Allocas.empty()) break; if (HasDomTree) - PromoteMemToReg(Allocas, *DT); + PromoteMemToReg(Allocas, *DT, nullptr, &AC); else { SSAUpdater SSA; for (unsigned i = 0, e = Allocas.size(); i != e; ++i) { @@ -1658,7 +1672,7 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) { // If we've already checked this PHI, don't do it again. if (PHINode *PN = dyn_cast<PHINode>(I)) - if (!Info.CheckedPHIs.insert(PN)) + if (!Info.CheckedPHIs.insert(PN).second) return; for (User *U : I->users()) { diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp index 7a73f113b1d9..6036c099be0e 100644 --- a/lib/Transforms/Scalar/Scalarizer.cpp +++ b/lib/Transforms/Scalar/Scalarizer.cpp @@ -150,6 +150,16 @@ public: bool visitLoadInst(LoadInst &); bool visitStoreInst(StoreInst &); + static void registerOptions() { + // This is disabled by default because having separate loads and stores + // makes it more likely that the -combiner-alias-analysis limits will be + // reached. + OptionRegistry::registerOption<bool, Scalarizer, + &Scalarizer::ScalarizeLoadStore>( + "scalarize-load-store", + "Allow the scalarizer pass to scalarize loads and store", false); + } + private: Scatterer scatter(Instruction *, Value *); void gather(Instruction *, const ValueVector &); @@ -164,19 +174,14 @@ private: GatherList Gathered; unsigned ParallelLoopAccessMDKind; const DataLayout *DL; + bool ScalarizeLoadStore; }; char Scalarizer::ID = 0; } // end anonymous namespace -// This is disabled by default because having separate loads and stores makes -// it more likely that the -combiner-alias-analysis limits will be reached. 
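Related to the AA-metadata changes above (ScalarReplAggregates now copies all AA tags rather than only !tbaa, and the Scalarizer hunk below adds alias.scope/noalias to the transferable kinds), here is a sketch of the copy pattern. It is not code from this commit; the helper name and the exact headers are assumptions.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

// Duplicate a load while preserving its alignment and all AA metadata.
llvm::LoadInst *cloneLoadWithAAInfo(llvm::IRBuilder<> &Builder,
                                    llvm::LoadInst *LI) {
  llvm::LoadInst *NewLI =
      Builder.CreateLoad(LI->getPointerOperand(), LI->getName() + ".copy");
  NewLI->setAlignment(LI->getAlignment());
  llvm::AAMDNodes Tags;
  LI->getAAMetadata(Tags); // collects tbaa, alias.scope and noalias, if present
  if (Tags)
    NewLI->setAAMetadata(Tags);
  return NewLI;
}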
-static cl::opt<bool> ScalarizeLoadStore - ("scalarize-load-store", cl::Hidden, cl::init(false), - cl::desc("Allow the scalarizer pass to scalarize loads and store")); - -INITIALIZE_PASS(Scalarizer, "scalarizer", "Scalarize vector operations", - false, false) +INITIALIZE_PASS_WITH_OPTIONS(Scalarizer, "scalarizer", + "Scalarize vector operations", false, false) Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, ValueVector *cachePtr) @@ -236,7 +241,9 @@ Value *Scatterer::operator[](unsigned I) { bool Scalarizer::doInitialization(Module &M) { ParallelLoopAccessMDKind = - M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); + M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); + ScalarizeLoadStore = + M.getContext().getOption<bool, Scalarizer, &Scalarizer::ScalarizeLoadStore>(); return false; } @@ -312,6 +319,8 @@ bool Scalarizer::canTransferMetadata(unsigned Tag) { || Tag == LLVMContext::MD_fpmath || Tag == LLVMContext::MD_tbaa_struct || Tag == LLVMContext::MD_invariant_load + || Tag == LLVMContext::MD_alias_scope + || Tag == LLVMContext::MD_noalias || Tag == ParallelLoopAccessMDKind); } @@ -322,8 +331,10 @@ void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) { Op->getAllMetadataOtherThanDebugLoc(MDs); for (unsigned I = 0, E = CV.size(); I != E; ++I) { if (Instruction *New = dyn_cast<Instruction>(CV[I])) { - for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator - MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) + for (SmallVectorImpl<std::pair<unsigned, MDNode *>>::iterator + MI = MDs.begin(), + ME = MDs.end(); + MI != ME; ++MI) if (canTransferMetadata(MI->first)) New->setMetadata(MI->first, MI->second); New->setDebugLoc(Op->getDebugLoc()); diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 6557ce4575dd..6157746af48c 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -79,6 +79,81 @@ // ld.global.f32 %f3, [%rl6+128]; // much better // ld.global.f32 %f4, [%rl6+132]; // much better // +// Another improvement enabled by the LowerGEP flag is to lower a GEP with +// multiple indices to either multiple GEPs with a single index or arithmetic +// operations (depending on whether the target uses alias analysis in codegen). +// Such transformation can have following benefits: +// (1) It can always extract constants in the indices of structure type. +// (2) After such Lowering, there are more optimization opportunities such as +// CSE, LICM and CGP. +// +// E.g. The following GEPs have multiple indices: +// BB1: +// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3 +// load %p +// ... +// BB2: +// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2 +// load %p2 +// ... +// +// We can not do CSE for to the common part related to index "i64 %i". Lowering +// GEPs can achieve such goals. +// If the target does not use alias analysis in codegen, this pass will +// lower a GEP with multiple indices into arithmetic operations: +// BB1: +// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity +// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity +// %3 = add i64 %1, %2 ; CSE opportunity +// %4 = mul i64 %j1, length_of_struct +// %5 = add i64 %3, %4 +// %6 = add i64 %3, struct_field_3 ; Constant offset +// %p = inttoptr i64 %6 to i32* +// load %p +// ... 
+// BB2: +// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity +// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity +// %9 = add i64 %7, %8 ; CSE opportunity +// %10 = mul i64 %j2, length_of_struct +// %11 = add i64 %9, %10 +// %12 = add i64 %11, struct_field_2 ; Constant offset +// %p = inttoptr i64 %12 to i32* +// load %p2 +// ... +// +// If the target uses alias analysis in codegen, this pass will lower a GEP +// with multiple indices into multiple GEPs with a single index: +// BB1: +// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity +// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity +// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity +// %4 = mul i64 %j1, length_of_struct +// %5 = getelementptr i8* %3, i64 %4 +// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset +// %p = bitcast i8* %6 to i32* +// load %p +// ... +// BB2: +// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity +// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity +// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity +// %10 = mul i64 %j2, length_of_struct +// %11 = getelementptr i8* %9, i64 %10 +// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset +// %p2 = bitcast i8* %12 to i32* +// load %p2 +// ... +// +// Lowering GEPs can also benefit other passes such as LICM and CGP. +// LICM (Loop Invariant Code Motion) can not hoist/sink a GEP of multiple +// indices if one of the index is variant. If we lower such GEP into invariant +// parts and variant parts, LICM can hoist/sink those invariant parts. +// CGP (CodeGen Prepare) tries to sink address calculations that match the +// target's addressing modes. A GEP with multiple indices may not match and will +// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of +// them. So we end up with a better addressing mode. +// //===----------------------------------------------------------------------===// #include "llvm/Analysis/TargetTransformInfo.h" @@ -92,6 +167,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/IR/IRBuilder.h" using namespace llvm; @@ -117,18 +195,17 @@ namespace { /// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15). class ConstantOffsetExtractor { public: - /// Extracts a constant offset from the given GEP index. It outputs the - /// numeric value of the extracted constant offset (0 if failed), and a + /// Extracts a constant offset from the given GEP index. It returns the /// new index representing the remainder (equal to the original index minus - /// the constant offset). + /// the constant offset), or nullptr if we cannot extract a constant offset. /// \p Idx The given GEP index - /// \p NewIdx The new index to replace (output) /// \p DL The datalayout of the module /// \p GEP The given GEP - static int64_t Extract(Value *Idx, Value *&NewIdx, const DataLayout *DL, - GetElementPtrInst *GEP); - /// Looks for a constant offset without extracting it. The meaning of the - /// arguments and the return value are the same as Extract. + static Value *Extract(Value *Idx, const DataLayout *DL, + GetElementPtrInst *GEP); + /// Looks for a constant offset from the given GEP index without extracting + /// it. It returns the numeric value of the extracted constant offset (0 if + /// failed). The meaning of the arguments are the same as Extract. 
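The arithmetic form sketched in the comment above boils down to: lowered address = base + sum(index * element_size) + constant_offset. A purely illustrative plain-C++ model follows (the struct, sizes and addresses are made up, and real IR uses pointer-width integers via ptrtoint/inttoptr):

#include <cstdint>
#include <cstdio>
#include <vector>

struct ScaledIndex {
  int64_t Index;       // the variadic part of one sequential GEP index
  int64_t ElementSize; // allocation size of the indexed type
};

int64_t lowerGEPToArithmetic(int64_t BaseAddr,
                             const std::vector<ScaledIndex> &Indices,
                             int64_t AccumulativeByteOffset) {
  int64_t Addr = BaseAddr;              // ptrtoint of the base pointer
  for (const ScaledIndex &SI : Indices)
    Addr += SI.Index * SI.ElementSize;  // one mul + add per sequential index
  return Addr + AccumulativeByteOffset; // the extracted constant offset last
}

int main() {
  // Roughly: getelementptr [10 x %struct]* %ptr, i64 2, i64 5, i32 3
  // with sizeof([10 x %struct]) = 400, sizeof(%struct) = 40, field 3 at +12.
  std::printf("0x%llx\n", (unsigned long long)lowerGEPToArithmetic(
                              0x1000, {{2, 400}, {5, 40}}, 12));
}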
static int64_t Find(Value *Idx, const DataLayout *DL, GetElementPtrInst *GEP); private: @@ -228,7 +305,9 @@ class ConstantOffsetExtractor { class SeparateConstOffsetFromGEP : public FunctionPass { public: static char ID; - SeparateConstOffsetFromGEP() : FunctionPass(ID) { + SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr, + bool LowerGEP = false) + : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) { initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry()); } @@ -251,10 +330,29 @@ class SeparateConstOffsetFromGEP : public FunctionPass { /// Tries to split the given GEP into a variadic base and a constant offset, /// and returns true if the splitting succeeds. bool splitGEP(GetElementPtrInst *GEP); - /// Finds the constant offset within each index, and accumulates them. This - /// function only inspects the GEP without changing it. The output - /// NeedsExtraction indicates whether we can extract a non-zero constant - /// offset from any index. + /// Lower a GEP with multiple indices into multiple GEPs with a single index. + /// Function splitGEP already split the original GEP into a variadic part and + /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the + /// variadic part into a set of GEPs with a single index and applies + /// AccumulativeByteOffset to it. + /// \p Variadic The variadic part of the original GEP. + /// \p AccumulativeByteOffset The constant offset. + void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic, + int64_t AccumulativeByteOffset); + /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form. + /// Function splitGEP already split the original GEP into a variadic part and + /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the + /// variadic part into a set of arithmetic operations and applies + /// AccumulativeByteOffset to it. + /// \p Variadic The variadic part of the original GEP. + /// \p AccumulativeByteOffset The constant offset. + void lowerToArithmetics(GetElementPtrInst *Variadic, + int64_t AccumulativeByteOffset); + /// Finds the constant offset within each index and accumulates them. If + /// LowerGEP is true, it finds in indices of both sequential and structure + /// types, otherwise it only finds in sequential indices. The output + /// NeedsExtraction indicates whether we successfully find a non-zero constant + /// offset. int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction); /// Canonicalize array indices to pointer-size integers. This helps to /// simplify the logic of splitting a GEP. For example, if a + b is a @@ -274,6 +372,10 @@ class SeparateConstOffsetFromGEP : public FunctionPass { bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP); const DataLayout *DL; + const TargetMachine *TM; + /// Whether to lower a GEP with multiple indices into arithmetic operations or + /// multiple GEPs with a single index. 
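Both lowering helpers described above scale each sequential index by its element size, preferring a shift when the size is a power of two and a multiply otherwise. A standalone sketch of that choice (invented names, width fixed at 64 bits; the pass itself emits CreateShl/CreateMul at the index's own width):

#include <cstdint>
#include <cstdio>

uint64_t scaleIndex(uint64_t Idx, uint64_t ElementSize) {
  if (ElementSize == 1)
    return Idx; // nothing to scale
  if ((ElementSize & (ElementSize - 1)) == 0) {
    // Power of two: a shift is enough.
    unsigned Log2 = 0;
    while ((ElementSize >> Log2) != 1)
      ++Log2;
    return Idx << Log2;
  }
  return Idx * ElementSize; // general case: a multiply
}

int main() {
  std::printf("%llu %llu\n",
              (unsigned long long)scaleIndex(5, 16),  // 80, via the shift path
              (unsigned long long)scaleIndex(5, 40)); // 200, via the multiply
}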
+ bool LowerGEP; }; } // anonymous namespace @@ -289,8 +391,10 @@ INITIALIZE_PASS_END( "Split GEPs to a variadic base and a constant offset for better CSE", false, false) -FunctionPass *llvm::createSeparateConstOffsetFromGEPPass() { - return new SeparateConstOffsetFromGEP(); +FunctionPass * +llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM, + bool LowerGEP) { + return new SeparateConstOffsetFromGEP(TM, LowerGEP); } bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, @@ -519,8 +623,13 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { // // Replacing the "or" with "add" is fine, because // a | (b + 5) = a + (b + 5) = (a + b) + 5 - return BinaryOperator::CreateAdd(BO->getOperand(0), BO->getOperand(1), - BO->getName(), IP); + if (OpNo == 0) { + return BinaryOperator::CreateAdd(NextInChain, TheOther, BO->getName(), + IP); + } else { + return BinaryOperator::CreateAdd(TheOther, NextInChain, BO->getName(), + IP); + } } // We can reuse BO in this case, because the new expression shares the same @@ -537,19 +646,17 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { return BO; } -int64_t ConstantOffsetExtractor::Extract(Value *Idx, Value *&NewIdx, - const DataLayout *DL, - GetElementPtrInst *GEP) { +Value *ConstantOffsetExtractor::Extract(Value *Idx, const DataLayout *DL, + GetElementPtrInst *GEP) { ConstantOffsetExtractor Extractor(DL, GEP); // Find a non-zero constant offset first. APInt ConstantOffset = Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, GEP->isInBounds()); - if (ConstantOffset != 0) { - // Separates the constant offset from the GEP index. - NewIdx = Extractor.rebuildWithoutConstOffset(); - } - return ConstantOffset.getSExtValue(); + if (ConstantOffset == 0) + return nullptr; + // Separates the constant offset from the GEP index. + return Extractor.rebuildWithoutConstOffset(); } int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL, @@ -615,11 +722,116 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, AccumulativeByteOffset += ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType()); } + } else if (LowerGEP) { + StructType *StTy = cast<StructType>(*GTI); + uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue(); + // Skip field 0 as the offset is always 0. + if (Field != 0) { + NeedsExtraction = true; + AccumulativeByteOffset += + DL->getStructLayout(StTy)->getElementOffset(Field); + } } } return AccumulativeByteOffset; } +void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( + GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) { + IRBuilder<> Builder(Variadic); + Type *IntPtrTy = DL->getIntPtrType(Variadic->getType()); + + Type *I8PtrTy = + Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace()); + Value *ResultPtr = Variadic->getOperand(0); + if (ResultPtr->getType() != I8PtrTy) + ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); + + gep_type_iterator GTI = gep_type_begin(*Variadic); + // Create an ugly GEP for each sequential index. We don't create GEPs for + // structure indices, as they are accumulated in the constant offset index. + for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) { + if (isa<SequentialType>(*GTI)) { + Value *Idx = Variadic->getOperand(I); + // Skip zero indices. 
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) + if (CI->isZero()) + continue; + + APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(), + DL->getTypeAllocSize(GTI.getIndexedType())); + // Scale the index by element size. + if (ElementSize != 1) { + if (ElementSize.isPowerOf2()) { + Idx = Builder.CreateShl( + Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2())); + } else { + Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize)); + } + } + // Create an ugly GEP with a single index for each index. + ResultPtr = Builder.CreateGEP(ResultPtr, Idx, "uglygep"); + } + } + + // Create a GEP with the constant offset index. + if (AccumulativeByteOffset != 0) { + Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset); + ResultPtr = Builder.CreateGEP(ResultPtr, Offset, "uglygep"); + } + if (ResultPtr->getType() != Variadic->getType()) + ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType()); + + Variadic->replaceAllUsesWith(ResultPtr); + Variadic->eraseFromParent(); +} + +void +SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic, + int64_t AccumulativeByteOffset) { + IRBuilder<> Builder(Variadic); + Type *IntPtrTy = DL->getIntPtrType(Variadic->getType()); + + Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy); + gep_type_iterator GTI = gep_type_begin(*Variadic); + // Create ADD/SHL/MUL arithmetic operations for each sequential indices. We + // don't create arithmetics for structure indices, as they are accumulated + // in the constant offset index. + for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) { + if (isa<SequentialType>(*GTI)) { + Value *Idx = Variadic->getOperand(I); + // Skip zero indices. + if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) + if (CI->isZero()) + continue; + + APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(), + DL->getTypeAllocSize(GTI.getIndexedType())); + // Scale the index by element size. + if (ElementSize != 1) { + if (ElementSize.isPowerOf2()) { + Idx = Builder.CreateShl( + Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2())); + } else { + Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize)); + } + } + // Create an ADD for each index. + ResultPtr = Builder.CreateAdd(ResultPtr, Idx); + } + } + + // Create an ADD for the constant offset index. + if (AccumulativeByteOffset != 0) { + ResultPtr = Builder.CreateAdd( + ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset)); + } + + ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType()); + Variadic->replaceAllUsesWith(ResultPtr); + Variadic->eraseFromParent(); +} + bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // Skip vector GEPs. if (GEP->getType()->isVectorTy()) @@ -637,32 +849,42 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { if (!NeedsExtraction) return Changed; - // Before really splitting the GEP, check whether the backend supports the - // addressing mode we are about to produce. If no, this splitting probably - // won't be beneficial. - TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); - if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), - /*BaseGV=*/nullptr, AccumulativeByteOffset, - /*HasBaseReg=*/true, /*Scale=*/0)) { - return Changed; + // If LowerGEP is disabled, before really splitting the GEP, check whether the + // backend supports the addressing mode we are about to produce. If no, this + // splitting probably won't be beneficial. 
+ // If LowerGEP is enabled, even the extracted constant offset can not match + // the addressing mode, we can still do optimizations to other lowered parts + // of variable indices. Therefore, we don't check for addressing modes in that + // case. + if (!LowerGEP) { + TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); + if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), + /*BaseGV=*/nullptr, AccumulativeByteOffset, + /*HasBaseReg=*/true, /*Scale=*/0)) { + return Changed; + } } - // Remove the constant offset in each GEP index. The resultant GEP computes - // the variadic base. + // Remove the constant offset in each sequential index. The resultant GEP + // computes the variadic base. + // Notice that we don't remove struct field indices here. If LowerGEP is + // disabled, a structure index is not accumulated and we still use the old + // one. If LowerGEP is enabled, a structure index is accumulated in the + // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later + // handle the constant offset and won't need a new structure index. gep_type_iterator GTI = gep_type_begin(*GEP); for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { if (isa<SequentialType>(*GTI)) { - Value *NewIdx = nullptr; - // Tries to extract a constant offset from this GEP index. - int64_t ConstantOffset = - ConstantOffsetExtractor::Extract(GEP->getOperand(I), NewIdx, DL, GEP); - if (ConstantOffset != 0) { - assert(NewIdx != nullptr && - "ConstantOffset != 0 implies NewIdx is set"); + // Splits this GEP index into a variadic part and a constant offset, and + // uses the variadic part as the new index. + Value *NewIdx = + ConstantOffsetExtractor::Extract(GEP->getOperand(I), DL, GEP); + if (NewIdx != nullptr) { GEP->setOperand(I, NewIdx); } } } + // Clear the inbounds attribute because the new index may be off-bound. // e.g., // @@ -684,6 +906,21 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // possible. GEPs with inbounds are more friendly to alias analysis. GEP->setIsInBounds(false); + // Lowers a GEP to either GEPs with a single index or arithmetic operations. + if (LowerGEP) { + // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to + // arithmetic operations if the target uses alias analysis in codegen. + if (TM && TM->getSubtarget<TargetSubtargetInfo>().useAA()) + lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset); + else + lowerToArithmetics(GEP, AccumulativeByteOffset); + return true; + } + + // No need to create another GEP if the accumulative byte offset is 0. + if (AccumulativeByteOffset == 0) + return true; + // Offsets the base with the accumulative byte offset. // // %gep ; the base @@ -715,16 +952,16 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { Instruction *NewGEP = GEP->clone(); NewGEP->insertBefore(GEP); - uint64_t ElementTypeSizeOfGEP = - DL->getTypeAllocSize(GEP->getType()->getElementType()); + // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned = + // unsigned.. Therefore, we cast ElementTypeSizeOfGEP to signed because it is + // used with unsigned integers later. + int64_t ElementTypeSizeOfGEP = static_cast<int64_t>( + DL->getTypeAllocSize(GEP->getType()->getElementType())); Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) { // Very likely. As long as %gep is natually aligned, the byte offset we // extracted should be a multiple of sizeof(*%gep). 
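The divisibility check referred to here is plain integer arithmetic; below is a standalone sketch of it, and of why the element size is cast to a signed type before dividing (not code from the pass, names invented).

#include <cstdint>
#include <cstdio>

// Returns true when the accumulated byte offset can be expressed as a whole
// number of GEP elements; otherwise the pass falls back to an i8* "uglygep".
bool tryElementIndex(int64_t AccumulativeByteOffset, uint64_t ElementTypeSize,
                     int64_t &IndexOut) {
  int64_t Size = static_cast<int64_t>(ElementTypeSize); // keep the division signed
  if (AccumulativeByteOffset % Size != 0)
    return false;
  IndexOut = AccumulativeByteOffset / Size;
  return true;
}

int main() {
  int64_t Idx;
  // A negative offset divided by an unsigned size would wrap; the signed cast
  // above keeps -32 / 4 equal to -8.
  if (tryElementIndex(-32, 4, Idx))
    std::printf("extra index: %lld\n", (long long)Idx);
}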
- // Per ANSI C standard, signed / unsigned = unsigned. Therefore, we - // cast ElementTypeSizeOfGEP to signed. - int64_t Index = - AccumulativeByteOffset / static_cast<int64_t>(ElementTypeSizeOfGEP); + int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP; NewGEP = GetElementPtrInst::Create( NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP); } else { diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 5d5606ba47b0..2e317f9d0999 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" @@ -34,22 +35,30 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "simplifycfg" +static cl::opt<unsigned> +UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1), + cl::desc("Control the number of bonus instructions (default = 1)")); + STATISTIC(NumSimpl, "Number of blocks simplified"); namespace { struct CFGSimplifyPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid - CFGSimplifyPass() : FunctionPass(ID) { + unsigned BonusInstThreshold; + CFGSimplifyPass(int T = -1) : FunctionPass(ID) { + BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T); initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfo>(); } }; @@ -59,12 +68,13 @@ char CFGSimplifyPass::ID = 0; INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, false) // Public interface to the CFGSimplification pass -FunctionPass *llvm::createCFGSimplificationPass() { - return new CFGSimplifyPass(); +FunctionPass *llvm::createCFGSimplificationPass(int Threshold) { + return new CFGSimplifyPass(Threshold); } /// mergeEmptyReturnBlocks - If we have more than one empty (other than phi @@ -146,7 +156,8 @@ static bool mergeEmptyReturnBlocks(Function &F) { /// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, - const DataLayout *DL) { + const DataLayout *DL, AssumptionCache *AC, + unsigned BonusInstThreshold) { bool Changed = false; bool LocalChange = true; while (LocalChange) { @@ -155,7 +166,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded... 
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 7348c45c5d37..903b675fdd56 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -56,7 +56,7 @@ namespace {
     }
   private:
     bool ProcessBlock(BasicBlock &BB);
-    bool SinkInstruction(Instruction *I, SmallPtrSet<Instruction *, 8> &Stores);
+    bool SinkInstruction(Instruction *I, SmallPtrSetImpl<Instruction*> &Stores);
     bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const;
     bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const;
   };
@@ -157,7 +157,7 @@ bool Sinking::ProcessBlock(BasicBlock &BB) {
 }

 static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
-                         SmallPtrSet<Instruction *, 8> &Stores) {
+                         SmallPtrSetImpl<Instruction *> &Stores) {

   if (Inst->mayWriteToMemory()) {
     Stores.insert(Inst);
@@ -166,9 +166,8 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,

   if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
     AliasAnalysis::Location Loc = AA->getLocation(L);
-    for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(),
-         E = Stores.end(); I != E; ++I)
-      if (AA->getModRefInfo(*I, Loc) & AliasAnalysis::Mod)
+    for (Instruction *S : Stores)
+      if (AA->getModRefInfo(S, Loc) & AliasAnalysis::Mod)
         return false;
   }

@@ -220,7 +219,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst,
 /// SinkInstruction - Determine whether it is safe to sink the specified machine
 /// instruction out of its current block into a successor.
 bool Sinking::SinkInstruction(Instruction *Inst,
-                              SmallPtrSet<Instruction *, 8> &Stores) {
+                              SmallPtrSetImpl<Instruction *> &Stores) {
   // Don't sink static alloca instructions. CodeGen assumes allocas outside the
   // entry block are dynamically sized stack objects.
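The Sink.cpp hunks above change the Stores parameter from a reference to a fixed-size SmallPtrSet to SmallPtrSetImpl, so the callee no longer bakes the caller's inline size into its signature. A small sketch of that idiom (the helper names are illustrative, not from the patch):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Instruction.h"

// Taking SmallPtrSetImpl<T>& lets callers pick any inline size N.
static bool alreadySeen(llvm::Instruction *I,
                        llvm::SmallPtrSetImpl<llvm::Instruction *> &Seen) {
  // insert() returns {iterator, bool}; .second is false if I was already there.
  return !Seen.insert(I).second;
}

static void example(llvm::Instruction *I) {
  llvm::SmallPtrSet<llvm::Instruction *, 4> Small;   // both bind to the same
  llvm::SmallPtrSet<llvm::Instruction *, 64> Large;  // SmallPtrSetImpl parameter
  (void)alreadySeen(I, Small);
  (void)alreadySeen(I, Large);
}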
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index b9673ed655e0..7fe87f9319b6 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -10,6 +10,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Analysis/RegionPass.h"
@@ -166,6 +167,7 @@ class StructurizeCFG : public RegionPass {
   Region *ParentRegion;

   DominatorTree *DT;
+  LoopInfo *LI;

   RNVector Order;
   BBSet Visited;
@@ -247,6 +249,7 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequiredID(LowerSwitchID);
     AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfo>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     RegionPass::getAnalysisUsage(AU);
   }
@@ -301,8 +304,9 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
     for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
       BasicBlock *Succ = Term->getSuccessor(i);

-      if (Visited.count(Succ))
+      if (Visited.count(Succ) && LI->isLoopHeader(Succ)) {
         Loops[Succ] = BB;
+      }
     }
   }
 }
@@ -862,6 +866,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
   ParentRegion = R;

   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfo>();

   orderNodes();
   collectInfos();
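The StructurizeCFG change above consults LoopInfo so that only back edges into real natural-loop headers are recorded in Loops. A minimal sketch of the query it relies on (the helper function is hypothetical, not from the patch):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"

// Counts the natural-loop headers in F, using the same LoopInfo::isLoopHeader
// query the pass now uses to filter candidate back edges.
static unsigned countLoopHeaders(llvm::Function &F, llvm::LoopInfo &LI) {
  unsigned N = 0;
  for (llvm::BasicBlock &BB : F)
    if (LI.isLoopHeader(&BB))
      ++N;
  return N;
}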
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index b7580255150c..f3c3e3054b60 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -63,6 +63,7 @@
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
@@ -86,6 +87,7 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced");
 namespace {
   struct TailCallElim : public FunctionPass {
     const TargetTransformInfo *TTI;
+    const DataLayout *DL;

     static char ID; // Pass identification, replacement for typeid
     TailCallElim() : FunctionPass(ID) {
@@ -157,6 +159,8 @@ bool TailCallElim::runOnFunction(Function &F) {
   if (skipOptnoneFunction(F))
     return false;

+  DL = F.getParent()->getDataLayout();
+
   bool AllCallsAreTailCalls = false;
   bool Modified = markTails(F, AllCallsAreTailCalls);
   if (AllCallsAreTailCalls)
@@ -175,7 +179,7 @@ struct AllocaDerivedValueTracker {
   auto AddUsesToWorklist = [&](Value *V) {
     for (auto &U : V->uses()) {
-      if (!Visited.insert(&U))
+      if (!Visited.insert(&U).second)
        continue;
       Worklist.push_back(&U);
     }
@@ -400,18 +404,28 @@ bool TailCallElim::runTRE(Function &F) {
   // alloca' is changed from being a static alloca to being a dynamic alloca.
   // Until this is resolved, disable this transformation if that would ever
   // happen. This bug is PR962.
+  SmallVector<BasicBlock*, 8> BBToErase;
   for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
     if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
       bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
                                           ArgumentPHIs, !CanTRETailMarkedCall);
-      if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
+      if (!Change && BB->getFirstNonPHIOrDbg() == Ret) {
         Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
                                           TailCallsAreMarkedTail, ArgumentPHIs,
                                           !CanTRETailMarkedCall);
+        // FoldReturnAndProcessPred may have emptied some BB. Remember to
+        // erase them.
+        if (Change && BB->empty())
+          BBToErase.push_back(BB);
+
+      }
       MadeChange |= Change;
     }
   }

+  for (auto BB : BBToErase)
+    BB->eraseFromParent();
+
   // If we eliminated any tail recursions, it's possible that we inserted some
   // silly PHI nodes which just merge an initial value (the incoming operand)
   // with themselves. Check to see if we did and clean up our mess if so. This
@@ -450,7 +464,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
       // being loaded from.
       if (CI->mayWriteToMemory() ||
           !isSafeToLoadUnconditionally(L->getPointerOperand(), L,
-                                       L->getAlignment()))
+                                       L->getAlignment(), DL))
         return false;
     }
   }
@@ -819,8 +833,20 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
     if (CallInst *CI = FindTRECandidate(BI, CannotTailCallElimCallsMarkedTail)){
       DEBUG(dbgs() << "FOLDING: " << *BB
             << "INTO UNCOND BRANCH PRED: " << *Pred);
-      EliminateRecursiveTailCall(CI, FoldReturnIntoUncondBranch(Ret, BB, Pred),
-                                 OldEntry, TailCallsAreMarkedTail, ArgumentPHIs,
+      ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
+
+      // Cleanup: if all predecessors of BB have been eliminated by
+      // FoldReturnIntoUncondBranch, we would like to delete it, but we
+      // cannot just nuke it as it is being used as an iterator by our caller.
+      // Just empty it, and the caller will erase it when it is safe to do so.
+      // It is important to empty it, because the ret instruction in there is
+      // still using a value which EliminateRecursiveTailCall will attempt
+      // to remove.
+      if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
+        BB->getInstList().clear();
+
+      EliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
+                                 ArgumentPHIs,
                                  CannotTailCallElimCallsMarkedTail);
       ++NumRetDuped;
       Change = true;
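The comments added above spell out the underlying rule: a block the caller still holds as an iterator may be emptied, but must not be erased until iteration has moved past it. A generic sketch of that deferred-erase pattern (shouldDrop is a hypothetical predicate, not from the patch):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"

// Never erase a block while the function iterator still points at it; collect
// the candidates and erase them after the walk, as runTRE now does.
static void dropBlocksAfterVisit(llvm::Function &F,
                                 bool (*shouldDrop)(llvm::BasicBlock &)) {
  llvm::SmallVector<llvm::BasicBlock *, 8> ToErase;
  for (llvm::Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
    if (shouldDrop(*BB))
      ToErase.push_back(&*BB); // erasing here would invalidate BB
  for (llvm::BasicBlock *BB : ToErase)
    BB->eraseFromParent();
}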