author     Dimitry Andric <dim@FreeBSD.org>    2017-05-29 16:25:25 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2017-05-29 16:25:25 +0000
commit     ab44ce3d598882e51a25eb82eb7ae6308de85ae6 (patch)
tree       568d786a59d49bef961dcb9bd09d422701b9da5b /lib/Transforms
parent     b5630dbadf9a2a06754194387d6b0fd9962a67f1 (diff)
Diffstat (limited to 'lib/Transforms')
40 files changed, 1638 insertions, 330 deletions
diff --git a/lib/Transforms/Coroutines/CoroCleanup.cpp b/lib/Transforms/Coroutines/CoroCleanup.cpp index a97db6fde454..5cf2a8c25d83 100644 --- a/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -124,6 +124,7 @@ struct CoroCleanup : FunctionPass { if (!L) AU.setPreservesAll(); } + StringRef getPassName() const override { return "Coroutine Cleanup"; } }; } diff --git a/lib/Transforms/Coroutines/CoroEarly.cpp b/lib/Transforms/Coroutines/CoroEarly.cpp index e8bb0ca99d8a..b52989186165 100644 --- a/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/lib/Transforms/Coroutines/CoroEarly.cpp @@ -208,6 +208,9 @@ struct CoroEarly : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } + StringRef getPassName() const override { + return "Lower early coroutine intrinsics"; + } }; } diff --git a/lib/Transforms/Coroutines/CoroElide.cpp b/lib/Transforms/Coroutines/CoroElide.cpp index c6ac3f614ff7..acb22449142b 100644 --- a/lib/Transforms/Coroutines/CoroElide.cpp +++ b/lib/Transforms/Coroutines/CoroElide.cpp @@ -301,6 +301,7 @@ struct CoroElide : FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AAResultsWrapperPass>(); } + StringRef getPassName() const override { return "Coroutine Elision"; } }; } diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp index 417d57f7625b..85e9003ec3c5 100644 --- a/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/lib/Transforms/Coroutines/CoroFrame.cpp @@ -799,9 +799,9 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { splitAround(CSI, "CoroSuspend"); } - // Put fallthrough CoroEnd into its own block. Note: Shape::buildFrom places - // the fallthrough coro.end as the first element of CoroEnds array. - splitAround(Shape.CoroEnds.front(), "CoroEnd"); + // Put CoroEnds into their own blocks. + for (CoroEndInst *CE : Shape.CoroEnds) + splitAround(CE, "CoroEnd"); // Transforms multi-edge PHI Nodes, so that any value feeding into a PHI will // never has its definition separated from the PHI by the suspend point. @@ -813,19 +813,24 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { IRBuilder<> Builder(F.getContext()); SpillInfo Spills; - // See if there are materializable instructions across suspend points. - for (Instruction &I : instructions(F)) - if (materializable(I)) - for (User *U : I.users()) - if (Checker.isDefinitionAcrossSuspend(I, U)) - Spills.emplace_back(&I, U); + for (int Repeat = 0; Repeat < 4; ++Repeat) { + // See if there are materializable instructions across suspend points. + for (Instruction &I : instructions(F)) + if (materializable(I)) + for (User *U : I.users()) + if (Checker.isDefinitionAcrossSuspend(I, U)) + Spills.emplace_back(&I, U); - // Rewrite materializable instructions to be materialized at the use point. - DEBUG(dump("Materializations", Spills)); - rewriteMaterializableInstructions(Builder, Spills); + if (Spills.empty()) + break; + + // Rewrite materializable instructions to be materialized at the use point. + DEBUG(dump("Materializations", Spills)); + rewriteMaterializableInstructions(Builder, Spills); + Spills.clear(); + } // Collect the spills for arguments and other not-materializable values. 
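The buildCoroutineFrame hunk above turns a single materialization pass into a bounded fixpoint: rewriting a materializable instruction at its uses can itself create new definitions that live across a suspend point, so collection and rewriting repeat until no candidates remain, capped at four rounds. A minimal sketch of the pattern; collectSpills and rewriteAll are invented stand-ins for the pass's collection and rewriting steps:

    template <typename CollectFn, typename RewriteFn>
    void rewriteToFixpoint(CollectFn collectSpills, RewriteFn rewriteAll) {
      for (int Repeat = 0; Repeat < 4; ++Repeat) { // cap bounds compile time
        auto Spills = collectSpills(); // defs still live across a suspend
        if (Spills.empty())
          break;                       // fixpoint reached
        rewriteAll(Spills);            // may expose new spill candidates
      }
    }

This also explains why the assert "rewriteMaterializable did not do its job" is dropped below: after the bounded loop, leftover materializable spills are simply spilled rather than asserted away.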
- Spills.clear(); for (Argument &A : F.args()) for (User *U : A.users()) if (Checker.isDefinitionAcrossSuspend(A, U)) @@ -847,8 +852,6 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { if (I.getType()->isTokenTy()) report_fatal_error( "token definition is separated from the use by a suspend point"); - assert(!materializable(I) && - "rewriteMaterializable did not do its job"); Spills.emplace_back(&I, U); } } diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp index 12eb16789825..cd549e4be282 100644 --- a/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/lib/Transforms/Coroutines/CoroSplit.cpp @@ -228,15 +228,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape, SmallVector<ReturnInst *, 4> Returns; - if (DISubprogram *SP = F.getSubprogram()) { - // If we have debug info, add mapping for the metadata nodes that should not - // be cloned by CloneFunctionInfo. - auto &MD = VMap.MD(); - MD[SP->getUnit()].reset(SP->getUnit()); - MD[SP->getType()].reset(SP->getType()); - MD[SP->getFile()].reset(SP->getFile()); - } - CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns); + CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/false, Returns); // Remove old returns. for (ReturnInst *Return : Returns) @@ -509,12 +501,87 @@ static void simplifySuspendPoints(coro::Shape &Shape) { S.resize(N); } +static SmallPtrSet<BasicBlock *, 4> getCoroBeginPredBlocks(CoroBeginInst *CB) { + // Collect all blocks that we need to look for instructions to relocate. + SmallPtrSet<BasicBlock *, 4> RelocBlocks; + SmallVector<BasicBlock *, 4> Work; + Work.push_back(CB->getParent()); + + do { + BasicBlock *Current = Work.pop_back_val(); + for (BasicBlock *BB : predecessors(Current)) + if (RelocBlocks.count(BB) == 0) { + RelocBlocks.insert(BB); + Work.push_back(BB); + } + } while (!Work.empty()); + return RelocBlocks; +} + +static SmallPtrSet<Instruction *, 8> +getNotRelocatableInstructions(CoroBeginInst *CoroBegin, + SmallPtrSetImpl<BasicBlock *> &RelocBlocks) { + SmallPtrSet<Instruction *, 8> DoNotRelocate; + // Collect all instructions that we should not relocate + SmallVector<Instruction *, 8> Work; + + // Start with CoroBegin and terminators of all preceding blocks. + Work.push_back(CoroBegin); + BasicBlock *CoroBeginBB = CoroBegin->getParent(); + for (BasicBlock *BB : RelocBlocks) + if (BB != CoroBeginBB) + Work.push_back(BB->getTerminator()); + + // For every instruction in the Work list, place its operands in DoNotRelocate + // set. + do { + Instruction *Current = Work.pop_back_val(); + DoNotRelocate.insert(Current); + for (Value *U : Current->operands()) { + auto *I = dyn_cast<Instruction>(U); + if (!I) + continue; + if (isa<AllocaInst>(U)) + continue; + if (DoNotRelocate.count(I) == 0) { + Work.push_back(I); + DoNotRelocate.insert(I); + } + } + } while (!Work.empty()); + return DoNotRelocate; +} + +static void relocateInstructionBefore(CoroBeginInst *CoroBegin, Function &F) { + // Analyze which non-alloca instructions are needed for allocation and + // relocate the rest to after coro.begin. We need to do it, since some of the + // targets of those instructions may be placed into coroutine frame memory + // for which becomes available after coro.begin intrinsic. 
+ + auto BlockSet = getCoroBeginPredBlocks(CoroBegin); + auto DoNotRelocateSet = getNotRelocatableInstructions(CoroBegin, BlockSet); + + Instruction *InsertPt = CoroBegin->getNextNode(); + BasicBlock &BB = F.getEntryBlock(); // TODO: Look at other blocks as well. + for (auto B = BB.begin(), E = BB.end(); B != E;) { + Instruction &I = *B++; + if (isa<AllocaInst>(&I)) + continue; + if (&I == CoroBegin) + break; + if (DoNotRelocateSet.count(&I)) + continue; + I.moveBefore(InsertPt); + } +} + static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) { coro::Shape Shape(F); if (!Shape.CoroBegin) return; simplifySuspendPoints(Shape); + relocateInstructionBefore(Shape.CoroBegin, F); buildCoroutineFrame(F, Shape); replaceFrameSize(Shape); @@ -660,6 +727,7 @@ struct CoroSplit : public CallGraphSCCPass { void getAnalysisUsage(AnalysisUsage &AU) const override { CallGraphSCCPass::getAnalysisUsage(AU); } + StringRef getPassName() const override { return "Coroutine Splitting"; } }; } diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 8dff2fb3be8a..4c417f1c55eb 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -558,17 +558,17 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap( std::vector<User *> Users(DuplicateFunction->user_begin(), DuplicateFunction->user_end()); Function *CurrentCaller = nullptr; + std::unique_ptr<BlockFrequencyInfo> TempBFI; BlockFrequencyInfo *CurrentCallerBFI = nullptr; auto ComputeCurrBFI = [&,this](Function *Caller) { // For the old pass manager: if (!GetBFI) { - if (CurrentCallerBFI) - delete CurrentCallerBFI; DominatorTree DT(*Caller); LoopInfo LI(DT); BranchProbabilityInfo BPI(*Caller, LI); - CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI); + TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); + CurrentCallerBFI = TempBFI.get(); } else { // New pass manager: CurrentCallerBFI = &(*GetBFI)(*Caller); @@ -591,10 +591,6 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap( else CallSiteToProfCountMap[User] = 0; } - if (!GetBFI) { - if (CurrentCallerBFI) - delete CurrentCallerBFI; - } } Function *PartialInlinerImpl::unswitchFunction(Function *F) { diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index ec06d5f9fb05..9fd3a9021a27 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -155,6 +155,10 @@ static cl::opt<bool> cl::Hidden, cl::desc("Enable the simple loop unswitch pass.")); +static cl::opt<bool> EnableGVNSink( + "enable-gvn-sink", cl::init(false), cl::Hidden, + cl::desc("Enable the GVN sinking pass (default = on)")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -307,6 +311,11 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createEarlyCSEPass()); // Catch trivial redundancies if (EnableGVNHoist) MPM.add(createGVNHoistPass()); + if (EnableGVNSink) { + MPM.add(createGVNSinkPass()); + MPM.add(createCFGSimplificationPass()); + } + // Speculative execution if the target has divergent branches; otherwise nop. MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); MPM.add(createJumpThreadingPass()); // Thread jumps. 
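The new EnableGVNSink option above follows the same shape as EnableGVNHoist: a hidden, default-off cl::opt guarding an experimental pass, with a CFG simplification run scheduled immediately after it because sinking tends to leave behind empty blocks and mergeable tails. A sketch of that gating pattern; EnableMySink and addMyPasses are invented names:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Support/CommandLine.h"
    #include "llvm/Transforms/Scalar.h"
    using namespace llvm;

    static cl::opt<bool> EnableMySink(
        "enable-my-sink", cl::init(false), cl::Hidden,
        cl::desc("Enable the sinking pass (default = off)")); // keep desc in
                                                              // sync with init
    static void addMyPasses(legacy::PassManagerBase &MPM) {
      if (EnableMySink) {
        MPM.add(createGVNSinkPass());
        // Sinking leaves empty blocks behind; clean them up immediately.
        MPM.add(createCFGSimplificationPass());
      }
    }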
@@ -904,6 +913,12 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { if (OptLevel != 0) addLTOOptimizationPasses(PM); + else { + // The whole-program-devirt pass needs to run at -O0 because only it knows + // about the llvm.type.checked.load intrinsic: it needs to both lower the + // intrinsic itself and handle it in the summary. + PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr)); + } // Create a function that performs CFI checks for cross-DSO calls with targets // in the current module. diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 733eeb1767a3..7204bf517681 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -861,12 +861,9 @@ bool InstCombiner::willNotOverflowSignedSub(const Value *LHS, ComputeNumSignBits(RHS, 0, &CxtI) > 1) return true; - unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); - KnownBits LHSKnown(BitWidth); - computeKnownBits(LHS, LHSKnown, 0, &CxtI); + KnownBits LHSKnown = computeKnownBits(LHS, 0, &CxtI); - KnownBits RHSKnown(BitWidth); - computeKnownBits(RHS, RHSKnown, 0, &CxtI); + KnownBits RHSKnown = computeKnownBits(RHS, 0, &CxtI); // Subtraction of two 2's complement numbers having identical signs will // never overflow. @@ -1059,9 +1056,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // If this is a xor that was canonicalized from a sub, turn it back into // a sub and fuse this add with it. if (LHS->hasOneUse() && (XorRHS->getValue()+1).isPowerOf2()) { - IntegerType *IT = cast<IntegerType>(I.getType()); - KnownBits LHSKnown(IT->getBitWidth()); - computeKnownBits(XorLHS, LHSKnown, 0, &I); + KnownBits LHSKnown = computeKnownBits(XorLHS, 0, &I); if ((XorRHS->getValue() | LHSKnown.Zero).isAllOnesValue()) return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI), XorLHS); @@ -1577,8 +1572,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known // zero. 
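Most of the InstCombine churn in this commit is mechanical: the old idiom constructed a KnownBits of an explicitly computed width and filled it through an out-parameter, while the replacement overload (visible in the InstCombineInternal.h hunk further below) sizes and returns the result itself. A before/after fragment, with V and CxtI as placeholders:

    // Before: bit width threaded by hand (easy to get wrong).
    unsigned BitWidth = V->getType()->getScalarSizeInBits();
    KnownBits Known(BitWidth);
    computeKnownBits(V, Known, /*Depth=*/0, CxtI);

    // After: the helper derives the width from V's type.
    KnownBits Known = computeKnownBits(V, /*Depth=*/0, CxtI);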
if (Op0C->isMask()) { - KnownBits RHSKnown(BitWidth); - computeKnownBits(Op1, RHSKnown, 0, &I); + KnownBits RHSKnown = computeKnownBits(Op1, 0, &I); if ((*Op0C | RHSKnown.Zero).isAllOnesValue()) return BinaryOperator::CreateXor(Op1, Op0); } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 4227b2d01be8..1f8319efb3be 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1610,17 +1610,13 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Value *Mask = nullptr; Value *Masked = nullptr; if (LAnd->getOperand(0) == RAnd->getOperand(0) && - isKnownToBeAPowerOfTwo(LAnd->getOperand(1), DL, false, 0, &AC, CxtI, - &DT) && - isKnownToBeAPowerOfTwo(RAnd->getOperand(1), DL, false, 0, &AC, CxtI, - &DT)) { + isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, CxtI) && + isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, CxtI)) { Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1)); Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask); } else if (LAnd->getOperand(1) == RAnd->getOperand(1) && - isKnownToBeAPowerOfTwo(LAnd->getOperand(0), DL, false, 0, &AC, - CxtI, &DT) && - isKnownToBeAPowerOfTwo(RAnd->getOperand(0), DL, false, 0, &AC, - CxtI, &DT)) { + isKnownToBeAPowerOfTwo(LAnd->getOperand(0), false, 0, CxtI) && + isKnownToBeAPowerOfTwo(RAnd->getOperand(0), false, 0, CxtI)) { Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0)); Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask); } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index face7abcc95f..92a38f26dde7 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1378,9 +1378,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { if (!IT) return nullptr; - unsigned BitWidth = IT->getBitWidth(); - KnownBits Known(BitWidth); - IC.computeKnownBits(Op0, Known, 0, &II); + KnownBits Known = IC.computeKnownBits(Op0, 0, &II); // Create a mask for bits above (ctlz) or below (cttz) the first known one. bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz; @@ -1401,7 +1399,9 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { // If the input to cttz/ctlz is known to be non-zero, // then change the 'ZeroIsUndef' parameter to 'true' // because we know the zero behavior can't affect the result. - if (Known.One != 0 || isKnownNonZero(Op0, IC.getDataLayout())) { + if (Known.One != 0 || + isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II, + &IC.getDominatorTree())) { if (!match(II.getArgOperand(1), m_One())) { II.setOperand(1, IC.Builder->getTrue()); return &II; diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index f4bf5221f6a2..766939c56dff 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -692,8 +692,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // This only works for EQ and NE ICI->isEquality()) { // If Op1C some other power of two, convert: - KnownBits Known(Op1C->getType()->getBitWidth()); - computeKnownBits(ICI->getOperand(0), Known, 0, &CI); + KnownBits Known = computeKnownBits(ICI->getOperand(0), 0, &CI); APInt KnownZeroMask(~Known.Zero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? 
@@ -737,14 +736,11 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // may lead to additional simplifications. if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) { if (IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) { - uint32_t BitWidth = ITy->getBitWidth(); Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); - KnownBits KnownLHS(BitWidth); - KnownBits KnownRHS(BitWidth); - computeKnownBits(LHS, KnownLHS, 0, &CI); - computeKnownBits(RHS, KnownRHS, 0, &CI); + KnownBits KnownLHS = computeKnownBits(LHS, 0, &CI); + KnownBits KnownRHS = computeKnownBits(RHS, 0, &CI); if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) { APInt KnownBits = KnownLHS.Zero | KnownLHS.One; @@ -1063,9 +1059,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { // the icmp and sext into bitwise/integer operations. if (ICI->hasOneUse() && ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){ - unsigned BitWidth = Op1C->getType()->getBitWidth(); - KnownBits Known(BitWidth); - computeKnownBits(Op0, Known, 0, &CI); + KnownBits Known = computeKnownBits(Op0, 0, &CI); APInt KnownZeroMask(~Known.Zero); if (KnownZeroMask.isPowerOf2()) { @@ -1104,7 +1098,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { // Distribute the bit over the whole bit width. In = Builder->CreateAShr(In, ConstantInt::get(In->getType(), - BitWidth - 1), "sext"); + KnownZeroMask.getBitWidth() - 1), "sext"); } if (CI.getType() == In->getType()) diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 6492eaedae9c..2c2b7317a1c0 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1402,9 +1402,9 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) { if (*C == 0 && Pred == ICmpInst::ICMP_SGT) { SelectPatternResult SPR = matchSelectPattern(X, A, B); if (SPR.Flavor == SPF_SMIN) { - if (isKnownPositive(A, DL)) + if (isKnownPositive(A, DL, 0, &AC, &Cmp, &DT)) return new ICmpInst(Pred, B, Cmp.getOperand(1)); - if (isKnownPositive(B, DL)) + if (isKnownPositive(B, DL, 0, &AC, &Cmp, &DT)) return new ICmpInst(Pred, A, Cmp.getOperand(1)); } } @@ -1478,8 +1478,7 @@ Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp, // of the high bits truncated out of x are known. unsigned DstBits = Trunc->getType()->getScalarSizeInBits(), SrcBits = X->getType()->getScalarSizeInBits(); - KnownBits Known(SrcBits); - computeKnownBits(X, Known, 0, &Cmp); + KnownBits Known = computeKnownBits(X, 0, &Cmp); // If all the high bits are known, we can do this xform. if ((Known.Zero | Known.One).countLeadingOnes() >= SrcBits - DstBits) { @@ -3030,18 +3029,21 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { break; case Instruction::Add: case Instruction::Sub: - case Instruction::Xor: + case Instruction::Xor: { if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); - // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b - if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) { - if (CI->getValue().isSignMask()) { + + const APInt *C; + if (match(BO0->getOperand(1), m_APInt(C))) { + // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b + if (C->isSignMask()) { ICmpInst::Predicate NewPred = I.isSigned() ? 
I.getUnsignedPredicate() : I.getSignedPredicate(); return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0)); } - if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) { + // icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b + if (BO0->getOpcode() == Instruction::Xor && C->isMaxSignedValue()) { ICmpInst::Predicate NewPred = I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); NewPred = I.getSwappedPredicate(NewPred); @@ -3049,26 +3051,30 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { } } break; - case Instruction::Mul: + } + case Instruction::Mul: { if (!I.isEquality()) break; - if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) { - // a * Cst icmp eq/ne b * Cst --> a & Mask icmp b & Mask - // Mask = -1 >> count-trailing-zeros(Cst). - if (!CI->isZero() && !CI->isOne()) { - const APInt &AP = CI->getValue(); - ConstantInt *Mask = ConstantInt::get( - I.getContext(), - APInt::getLowBitsSet(AP.getBitWidth(), - AP.getBitWidth() - AP.countTrailingZeros())); + const APInt *C; + if (match(BO0->getOperand(1), m_APInt(C)) && *C != 0 && *C != 1) { + // icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask) + // Mask = -1 >> count-trailing-zeros(C). + if (unsigned TZs = C->countTrailingZeros()) { + Constant *Mask = ConstantInt::get( + BO0->getType(), + APInt::getLowBitsSet(C->getBitWidth(), C->getBitWidth() - TZs)); Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask); Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask); return new ICmpInst(Pred, And1, And2); } + // If there are no trailing zeros in the multiplier, just eliminate + // the multiplies (no masking is needed): + // icmp eq/ne (X * C), (Y * C) --> icmp eq/ne X, Y + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); } break; - + } case Instruction::UDiv: case Instruction::LShr: if (I.isSigned() || !BO0->isExact() || !BO1->isExact()) @@ -4497,7 +4503,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // if A is a power of 2. if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) && match(Op1, m_Zero()) && - isKnownToBeAPowerOfTwo(A, DL, false, 0, &AC, &I, &DT) && I.isEquality()) + isKnownToBeAPowerOfTwo(A, false, 0, &I) && I.isEquality()) return new ICmpInst(I.getInversePredicate(), Builder->CreateAnd(A, B), Op1); diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 6829be86885b..56f133de3de1 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -540,6 +540,12 @@ public: return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT); } + bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false, + unsigned Depth = 0, + const Instruction *CxtI = nullptr) { + return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT); + } + bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0, const Instruction *CxtI = nullptr) const { return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT); diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index fc13854f8fe7..4d408359eeea 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -47,9 +47,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, // inexact. Similarly for <<. 
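The Instruction::Mul case above generalizes the fold from ConstantInt to any constant matched by m_APInt (vector splats included) and adds the fast path: when C is odd it is invertible modulo 2^BW, so icmp eq (X*C),(Y*C) is just icmp eq X,Y; when C has TZ trailing zeros, only the low BW-TZ bits of X and Y affect the product, hence the mask. A standalone exhaustive 8-bit check of the masked form, using C = 12:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint8_t C = 12;           // 0b1100: two trailing zeros
      const uint8_t Mask = 0xFF >> 2; // -1 >> count-trailing-zeros(C)
      for (unsigned X = 0; X < 256; ++X)
        for (unsigned Y = 0; Y < 256; ++Y) {
          bool MulEq = uint8_t(X * C) == uint8_t(Y * C); // wraps mod 256
          bool MaskEq = (X & Mask) == (Y & Mask);
          assert(MulEq == MaskEq);
        }
      return 0;
    }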
BinaryOperator *I = dyn_cast<BinaryOperator>(V); if (I && I->isLogicalShift() && - isKnownToBeAPowerOfTwo(I->getOperand(0), IC.getDataLayout(), false, 0, - &IC.getAssumptionCache(), &CxtI, - &IC.getDominatorTree())) { + IC.isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, &CxtI)) { // We know that this is an exact/nuw shift and that the input is a // non-zero context as well. if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) { @@ -1240,7 +1238,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { return BO; } - if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, &AC, &I, &DT)) { + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) { // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y) // Safe because the only negative value (1 << Y) can take on is // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have @@ -1487,7 +1485,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { I.getType()); // X urem Y -> X and Y-1, where Y is a power of 2, - if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, &AC, &I, &DT)) { + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) { Constant *N1 = Constant::getAllOnesValue(I.getType()); Value *Add = Builder->CreateAdd(Op1, N1); return BinaryOperator::CreateAnd(Op0, Add); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 219effce7ba5..b40d067b2817 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -44,7 +44,8 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { Value *A; Constant *C; if (match(Op0, m_Constant()) && match(Op1, m_Add(m_Value(A), m_Constant(C)))) - if (isKnownNonNegative(A, DL) && isKnownNonNegative(C, DL)) + if (isKnownNonNegative(A, DL, 0, &AC, &I, &DT) && + isKnownNonNegative(C, DL, 0, &AC, &I, &DT)) return BinaryOperator::Create( I.getOpcode(), Builder->CreateBinOp(I.getOpcode(), Op0, C), A); diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 4028a92771a4..5df55f01b83f 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -158,8 +158,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); - assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // Output known-0 are known to be clear if zero in either the LHS | RHS. APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero; @@ -192,8 +192,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); - assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // Output known-0 bits are only known if clear in both the LHS & RHS. 
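The assert rewrites in InstCombineSimplifyDemanded.cpp below are behavior-preserving: KnownBits::hasConflict() reports a bit claimed to be both one and zero, which is exactly what the old manual expression tested. A free-function restatement of the equivalence:

    #include "llvm/ADT/APInt.h"

    // !(Known.Zero & Known.One)  is exactly  !hasConflict()
    bool hasConflict(const llvm::APInt &Zero, const llvm::APInt &One) {
      return (Zero & One) != 0; // any overlap is a contradiction
    }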
APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero; @@ -224,8 +224,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) || SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); - assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // Output known-0 bits are known if clear or set in both the LHS & RHS. APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) | @@ -313,8 +313,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) || SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); - assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. if (ShrinkDemandedConstant(I, 1, DemandedMask) || @@ -325,15 +325,19 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Known.One = RHSKnown.One & LHSKnown.One; Known.Zero = RHSKnown.Zero & LHSKnown.Zero; break; + case Instruction::ZExt: case Instruction::Trunc: { - unsigned truncBf = I->getOperand(0)->getType()->getScalarSizeInBits(); - DemandedMask = DemandedMask.zext(truncBf); - Known = Known.zext(truncBf); - if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) + unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); + + APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth); + KnownBits InputKnown(SrcBitWidth); + if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1)) return I; - DemandedMask = DemandedMask.trunc(BitWidth); - Known = Known.trunc(BitWidth); - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + Known = Known.zextOrTrunc(BitWidth); + // Any top bits are known to be zero. + if (BitWidth > SrcBitWidth) + Known.Zero.setBitsFrom(SrcBitWidth); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } case Instruction::BitCast: @@ -355,56 +359,36 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) return I; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); - break; - case Instruction::ZExt: { - // Compute the bits in the result that are not present in the input. - unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits(); - - DemandedMask = DemandedMask.trunc(SrcBitWidth); - Known = Known.trunc(SrcBitWidth); - if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) - return I; - DemandedMask = DemandedMask.zext(BitWidth); - Known = Known.zext(BitWidth); - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); - // The top bits are known to be zero. - Known.Zero.setBitsFrom(SrcBitWidth); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; - } case Instruction::SExt: { // Compute the bits in the result that are not present in the input. 
- unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits(); + unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); - APInt InputDemandedBits = DemandedMask & - APInt::getLowBitsSet(BitWidth, SrcBitWidth); + APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth); - APInt NewBits(APInt::getBitsSetFrom(BitWidth, SrcBitWidth)); // If any of the sign extended bits are demanded, we know that the sign // bit is demanded. - if ((NewBits & DemandedMask) != 0) + if (DemandedMask.getActiveBits() > SrcBitWidth) InputDemandedBits.setBit(SrcBitWidth-1); - InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth); - Known = Known.trunc(SrcBitWidth); - if (SimplifyDemandedBits(I, 0, InputDemandedBits, Known, Depth + 1)) + KnownBits InputKnown(SrcBitWidth); + if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1)) return I; - InputDemandedBits = InputDemandedBits.zext(BitWidth); - Known = Known.zext(BitWidth); - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); - - // If the sign bit of the input is known set or clear, then we know the - // top bits of the result. // If the input sign bit is known zero, or if the NewBits are not demanded // convert this into a zero extension. - if (Known.Zero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) { - // Convert to ZExt cast + if (InputKnown.isNonNegative() || + DemandedMask.getActiveBits() <= SrcBitWidth) { + // Convert to ZExt cast. CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName()); return InsertNewInstWith(NewCast, *I); - } else if (Known.One[SrcBitWidth-1]) { // Input sign bit known set - Known.One |= NewBits; - } + } + + // If the sign bit of the input is known set or clear, then we know the + // top bits of the result. + Known = InputKnown.sext(BitWidth); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } case Instruction::Add: @@ -467,7 +451,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero <<= ShiftAmt; Known.One <<= ShiftAmt; // low bits known zero. @@ -491,7 +475,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShiftAmt); Known.One.lshrInPlace(ShiftAmt); if (ShiftAmt) @@ -535,7 +519,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); // Compute the new bits that are at the top now. 
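The reworked SExt case above keys everything off DemandedMask.getActiveBits(): if no demanded bit lies at or above SrcBitWidth, the sign-copies are dead and the sext can become a zext; the same holds when the input's sign bit is known zero. The rewrite is safe because sext and zext agree on the low SrcBitWidth bits, e.g. for i8 to i32 (standalone check):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int V = -128; V < 128; ++V) {
        uint32_t S = uint32_t(int32_t(int8_t(V))); // sext i8 -> i32
        uint32_t Z = uint32_t(uint8_t(V));         // zext i8 -> i32
        assert((S & 0xFF) == (Z & 0xFF)); // low 8 bits always agree
      }
      return 0;
    }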
APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt)); Known.Zero.lshrInPlace(ShiftAmt); @@ -590,7 +574,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One)) Known.One |= ~LowBits; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 7ed9fd566b37..2730afc5c5b9 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1963,6 +1963,7 @@ static bool isAllocSiteRemovable(Instruction *AI, // Give up the moment we see something we can't handle. return false; + case Instruction::AddrSpaceCast: case Instruction::BitCast: case Instruction::GetElementPtr: Users.emplace_back(I); @@ -2064,7 +2065,8 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { replaceInstUsesWith(*C, ConstantInt::get(Type::getInt1Ty(C->getContext()), C->isFalseWhenEqual())); - } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { + } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I) || + isa<AddrSpaceCastInst>(I)) { replaceInstUsesWith(*I, UndefValue::get(I->getType())); } eraseInstFromFunction(*I); @@ -2180,8 +2182,7 @@ Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { // There might be assume intrinsics dominating this return that completely // determine the value. If so, constant fold it. - KnownBits Known(VTy->getPrimitiveSizeInBits()); - computeKnownBits(ResultOp, Known, 0, &RI); + KnownBits Known = computeKnownBits(ResultOp, 0, &RI); if (Known.isConstant()) RI.setOperand(0, Constant::getIntegerValue(VTy, Known.getConstant())); @@ -2242,9 +2243,7 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { return &SI; } - unsigned BitWidth = cast<IntegerType>(Cond->getType())->getBitWidth(); - KnownBits Known(BitWidth); - computeKnownBits(Cond, Known, 0, &SI); + KnownBits Known = computeKnownBits(Cond, 0, &SI); unsigned LeadingKnownZeros = Known.countMinLeadingZeros(); unsigned LeadingKnownOnes = Known.countMinLeadingOnes(); @@ -2257,12 +2256,12 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes()); } - unsigned NewWidth = BitWidth - std::max(LeadingKnownZeros, LeadingKnownOnes); + unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes); // Shrink the condition operand if the new type is smaller than the old type. // This may produce a non-standard type for the switch, but that's ok because // the backend should extend back to a legal type for the target. - if (NewWidth > 0 && NewWidth < BitWidth) { + if (NewWidth > 0 && NewWidth < Known.getBitWidth()) { IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth); Builder->SetInsertPoint(&SI); Value *NewCond = Builder->CreateTrunc(Cond, Ty, "trunc"); @@ -2841,9 +2840,7 @@ bool InstCombiner::run() { // a value even when the operands are not all constants. 
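The visitSwitchInst hunk above now takes the bit width from the KnownBits result instead of a separately tracked variable; the shrinking arithmetic is unchanged. Worked numbers, assuming a 32-bit condition whose top 24 bits are known zero and whose case values all fit:

    // LeadingKnownZeros = 24, LeadingKnownOnes = 0
    // NewWidth = Known.getBitWidth() - max(24, 0) = 32 - 24 = 8
    // 0 < 8 < 32, so the switch is rewritten on trunc i32 %cond to i8;
    // the backend re-extends to a legal type if i8 is not native.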
Type *Ty = I->getType(); if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy()) { - unsigned BitWidth = Ty->getScalarSizeInBits(); - KnownBits Known(BitWidth); - computeKnownBits(I, Known, /*Depth*/0, I); + KnownBits Known = computeKnownBits(I, /*Depth*/0, I); if (Known.isConstant()) { Constant *C = ConstantInt::get(Ty, Known.getConstant()); DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C << diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 990bcec109de..1e30dbf6b55a 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -180,7 +180,7 @@ static cl::opt<bool> static cl::opt<bool> PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden, cl::desc("Use this option to turn on/off " - "memory instrinsic size profiling.")); + "memory intrinsic size profiling.")); // Command line option to turn on CFG dot dump after profile annotation. // Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 4bc0a7133118..300085eccb0c 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -401,7 +401,10 @@ static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB, if (Options.NoPrune || &F.getEntryBlock() == BB) return true; - return !(isFullDominator(BB, DT) || isFullPostDominator(BB, PDT)); + // Do not instrument full dominators, or full post-dominators with multiple + // predecessors. + return !isFullDominator(BB, DT) + && !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor()); } bool SanitizerCoverageModule::runOnFunction(Function &F) { diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 523390758769..f5196cc46181 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -13,6 +13,7 @@ add_llvm_library(LLVMScalarOpts GuardWidening.cpp GVN.cpp GVNHoist.cpp + GVNSink.cpp IVUsersPrinter.cpp InductiveRangeCheckElimination.cpp IndVarSimplify.cpp diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index f62e111460ca..c3810366bf22 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -164,9 +164,9 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, /// \brief Given \p BBs as input, find another set of BBs which collectively /// dominates \p BBs and have the minimal sum of frequencies. Return the BB /// set found in \p BBs. -void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, - BasicBlock *Entry, - SmallPtrSet<BasicBlock *, 8> &BBs) { +static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, + BasicBlock *Entry, + SmallPtrSet<BasicBlock *, 8> &BBs) { assert(!BBs.count(Entry) && "Assume Entry is not in BBs"); // Nodes on the current path to the root. 
SmallPtrSet<BasicBlock *, 8> Path; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 0490d93f6455..0d6e0538261d 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -80,9 +80,10 @@ MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, struct llvm::GVN::Expression { uint32_t opcode; Type *type; + bool commutative; SmallVector<uint32_t, 4> varargs; - Expression(uint32_t o = ~2U) : opcode(o) {} + Expression(uint32_t o = ~2U) : opcode(o), commutative(false) {} bool operator==(const Expression &other) const { if (opcode != other.opcode) @@ -246,6 +247,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!"); if (e.varargs[0] > e.varargs[1]) std::swap(e.varargs[0], e.varargs[1]); + e.commutative = true; } if (CmpInst *C = dyn_cast<CmpInst>(I)) { @@ -256,6 +258,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (C->getOpcode() << 8) | Predicate; + e.commutative = true; } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) { for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); II != IE; ++II) @@ -281,6 +284,7 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode, Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (Opcode << 8) | Predicate; + e.commutative = true; return e; } @@ -348,25 +352,25 @@ GVN::ValueTable::~ValueTable() = default; /// add - Insert a value into the table with a specified value number. void GVN::ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); + if (PHINode *PN = dyn_cast<PHINode>(V)) + NumberingPhi[num] = PN; } uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { if (AA->doesNotAccessMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } else if (AA->onlyReadsMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) { - e = nextValueNumber++; - valueNumbering[C] = e; - return e; + auto ValNum = assignExpNewValueNum(exp); + if (ValNum.second) { + valueNumbering[C] = ValNum.first; + return ValNum.first; } if (!MD) { - e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } @@ -522,23 +526,29 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) { case Instruction::ExtractValue: exp = createExtractvalueExpr(cast<ExtractValueInst>(I)); break; + case Instruction::PHI: + valueNumbering[V] = nextValueNumber; + NumberingPhi[nextValueNumber] = cast<PHINode>(V); + return nextValueNumber++; default: valueNumbering[V] = nextValueNumber; return nextValueNumber++; } - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[V] = e; return e; } /// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. 
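The new Expression::commutative flag in the GVN hunks below records, at creation time, that an expression's operands were sorted by value number (and, for compares, that the predicate was swapped to match); phiTranslateImpl later uses it to re-canonicalize after operand numbers change. A standalone sketch of that re-canonicalization; Expr, IsCmp and SwappedPredicate are invented stand-ins:

    #include <utility>

    struct Expr {
      unsigned Opcode;   // compares pack the predicate into the low byte
      bool Commutative;
      unsigned Ops[2];   // operand value numbers
    };

    void recanonicalize(Expr &E, bool IsCmp,
                        unsigned (*SwappedPredicate)(unsigned)) {
      if (!E.Commutative || E.Ops[0] <= E.Ops[1])
        return;
      std::swap(E.Ops[0], E.Ops[1]);
      if (IsCmp) // swapping compare operands must also swap the predicate
        E.Opcode = (E.Opcode & ~255u) | SwappedPredicate(E.Opcode & 255u);
    }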
-uint32_t GVN::ValueTable::lookup(Value *V) const { +uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const { DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); - assert(VI != valueNumbering.end() && "Value not numbered?"); - return VI->second; + if (Verify) { + assert(VI != valueNumbering.end() && "Value not numbered?"); + return VI->second; + } + return (VI != valueNumbering.end()) ? VI->second : 0; } /// Returns the value number of the given comparison, @@ -549,21 +559,29 @@ uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS) { Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS); - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; - return e; + return assignExpNewValueNum(exp).first; } /// Remove all entries from the ValueTable. void GVN::ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); + NumberingPhi.clear(); + PhiTranslateTable.clear(); + BlockRPONumber.clear(); nextValueNumber = 1; + Expressions.clear(); + ExprIdx.clear(); + nextExprNumber = 0; } /// Remove a value from the value numbering. void GVN::ValueTable::erase(Value *V) { + uint32_t Num = valueNumbering.lookup(V); valueNumbering.erase(V); + // If V is PHINode, V <--> value number is an one-to-one mapping. + if (isa<PHINode>(V)) + NumberingPhi.erase(Num); } /// verifyRemoved - Verify that the value is removed from all internal data @@ -1451,6 +1469,104 @@ bool GVN::processLoad(LoadInst *L) { return false; } +/// Return a pair the first field showing the value number of \p Exp and the +/// second field showing whether it is a value number newly created. +std::pair<uint32_t, bool> +GVN::ValueTable::assignExpNewValueNum(Expression &Exp) { + uint32_t &e = expressionNumbering[Exp]; + bool CreateNewValNum = !e; + if (CreateNewValNum) { + Expressions.push_back(Exp); + if (ExprIdx.size() < nextValueNumber + 1) + ExprIdx.resize(nextValueNumber * 2); + e = nextValueNumber; + ExprIdx[nextValueNumber++] = nextExprNumber++; + } + return {e, CreateNewValNum}; +} + +void GVN::ValueTable::assignBlockRPONumber(Function &F) { + uint32_t NextBlockNumber = 1; + ReversePostOrderTraversal<Function *> RPOT(&F); + for (BasicBlock *BB : RPOT) + BlockRPONumber[BB] = NextBlockNumber++; +} + +/// Return whether all the values related with the same \p num are +/// defined in \p BB. +bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, + GVN &Gvn) { + LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; + while (Vals && Vals->BB == BB) + Vals = Vals->Next; + return !Vals; +} + +/// Wrap phiTranslateImpl to provide caching functionality. +uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, + const BasicBlock *PhiBlock, uint32_t Num, + GVN &Gvn) { + auto FindRes = PhiTranslateTable.find({Num, Pred}); + if (FindRes != PhiTranslateTable.end()) + return FindRes->second; + uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn); + PhiTranslateTable.insert({{Num, Pred}, NewNum}); + return NewNum; +} + +/// Translate value number \p Num using phis, so that it has the values of +/// the phis in BB. 
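Phi translation answers: what value number does Num take when control arrives from Pred? For a phi in PhiBlock it is simply the incoming value's number (unless the edge is a backedge by RPO order); for other expressions every operand is translated recursively. Since PRE asks this repeatedly, results are cached per (Num, Pred) pair, as the wrapper above shows. A minimal memoization sketch with invented names (the real code uses a DenseMap):

    #include <cstdint>
    #include <map>
    #include <utility>

    using Key = std::pair<uint32_t, const void *>; // (value number, pred)

    uint32_t translateCached(std::map<Key, uint32_t> &Cache, Key K,
                             uint32_t (*TranslateImpl)(Key)) {
      auto It = Cache.find(K);
      if (It != Cache.end())
        return It->second; // hit: no re-translation
      uint32_t NewNum = TranslateImpl(K);
      Cache.emplace(K, NewNum);
      return NewNum;
    }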
+uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, + const BasicBlock *PhiBlock, + uint32_t Num, GVN &Gvn) { + if (PHINode *PN = NumberingPhi[Num]) { + if (BlockRPONumber[Pred] >= BlockRPONumber[PhiBlock]) + return Num; + for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { + if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred) + if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false)) + return TransVal; + } + return Num; + } + + // If there is any value related with Num is defined in a BB other than + // PhiBlock, it cannot depend on a phi in PhiBlock without going through + // a backedge. We can do an early exit in that case to save compile time. + if (!areAllValsInBB(Num, PhiBlock, Gvn)) + return Num; + + if (ExprIdx[Num] == 0 || Num >= ExprIdx.size()) + return Num; + Expression Exp = Expressions[ExprIdx[Num]]; + + for (unsigned i = 0; i < Exp.varargs.size(); i++) { + // For InsertValue and ExtractValue, some varargs are index numbers + // instead of value numbers. Those index numbers should not be + // translated. + if ((i > 1 && Exp.opcode == Instruction::InsertValue) || + (i > 0 && Exp.opcode == Instruction::ExtractValue)) + continue; + Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn); + } + + if (Exp.commutative) { + assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!"); + if (Exp.varargs[0] > Exp.varargs[1]) { + std::swap(Exp.varargs[0], Exp.varargs[1]); + uint32_t Opcode = Exp.opcode >> 8; + if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) + Exp.opcode = (Opcode << 8) | + CmpInst::getSwappedPredicate( + static_cast<CmpInst::Predicate>(Exp.opcode & 255)); + } + } + + if (uint32_t NewNum = expressionNumbering[Exp]) + return NewNum; + return Num; +} + // In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, // and then scan the list to find one whose block dominates the block in @@ -1856,6 +1972,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, // Fabricate val-num for dead-code in order to suppress assertion in // performPRE(). assignValNumForDeadCode(); + VN.assignBlockRPONumber(F); bool PREChanged = true; while (PREChanged) { PREChanged = performPRE(F); @@ -1945,7 +2062,9 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, success = false; break; } - if (Value *V = findLeader(Pred, VN.lookup(Op))) { + uint32_t TValNo = + VN.phiTranslate(Pred, Instr->getParent(), VN.lookup(Op), *this); + if (Value *V = findLeader(Pred, TValNo)) { Instr->setOperand(i, V); } else { success = false; @@ -1962,10 +2081,12 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Instr->insertBefore(Pred->getTerminator()); Instr->setName(Instr->getName() + ".pre"); Instr->setDebugLoc(Instr->getDebugLoc()); - VN.add(Instr, ValNo); + + unsigned Num = VN.lookupOrAdd(Instr); + VN.add(Instr, Num); // Update the availability map to include the new instruction. 
- addToLeaderTable(ValNo, Instr, Pred); + addToLeaderTable(Num, Instr, Pred); return true; } @@ -2014,7 +2135,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { break; } - Value *predV = findLeader(P, ValNo); + uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); + Value *predV = findLeader(P, TValNo); if (!predV) { predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); PREPred = P; diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp new file mode 100644 index 000000000000..5c75f39e381d --- /dev/null +++ b/lib/Transforms/Scalar/GVNSink.cpp @@ -0,0 +1,872 @@ +//===- GVNSink.cpp - sink expressions into successors -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file GVNSink.cpp +/// This pass attempts to sink instructions into successors, reducing static +/// instruction count and enabling if-conversion. +/// +/// We use a variant of global value numbering to decide what can be sunk. +/// Consider: +/// +/// [ %a1 = add i32 %b, 1 ] [ %c1 = add i32 %d, 1 ] +/// [ %a2 = xor i32 %a1, 1 ] [ %c2 = xor i32 %c1, 1 ] +/// \ / +/// [ %e = phi i32 %a2, %c2 ] +/// [ add i32 %e, 4 ] +/// +/// +/// GVN would number %a1 and %c1 differently because they compute different +/// results - the VN of an instruction is a function of its opcode and the +/// transitive closure of its operands. This is the key property for hoisting +/// and CSE. +/// +/// What we want when sinking however is for a numbering that is a function of +/// the *uses* of an instruction, which allows us to answer the question "if I +/// replace %a1 with %c1, will it contribute in an equivalent way to all +/// successive instructions?". The PostValueTable class in GVN provides this +/// mapping. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/GVNExpression.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include <unordered_set> +using namespace llvm; + +#define DEBUG_TYPE "gvn-sink" + +STATISTIC(NumRemoved, "Number of instructions removed"); + +namespace { + +static bool isMemoryInst(const Instruction *I) { + return isa<LoadInst>(I) || isa<StoreInst>(I) || + (isa<InvokeInst>(I) && !cast<InvokeInst>(I)->doesNotAccessMemory()) || + (isa<CallInst>(I) && !cast<CallInst>(I)->doesNotAccessMemory()); +} + +/// Iterates through instructions in a set of blocks in reverse order from the +/// first non-terminator. 
For example (assume all blocks have size n): +/// LockstepReverseIterator I([B1, B2, B3]); +/// *I-- = [B1[n], B2[n], B3[n]]; +/// *I-- = [B1[n-1], B2[n-1], B3[n-1]]; +/// *I-- = [B1[n-2], B2[n-2], B3[n-2]]; +/// ... +/// +/// It continues until all blocks have been exhausted. Use \c getActiveBlocks() +/// to +/// determine which blocks are still going and the order they appear in the +/// list returned by operator*. +class LockstepReverseIterator { + ArrayRef<BasicBlock *> Blocks; + SmallPtrSet<BasicBlock *, 4> ActiveBlocks; + SmallVector<Instruction *, 4> Insts; + bool Fail; + +public: + LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) { + reset(); + } + + void reset() { + Fail = false; + ActiveBlocks.clear(); + for (BasicBlock *BB : Blocks) + ActiveBlocks.insert(BB); + Insts.clear(); + for (BasicBlock *BB : Blocks) { + if (BB->size() <= 1) { + // Block wasn't big enough - only contained a terminator. + ActiveBlocks.erase(BB); + continue; + } + Insts.push_back(BB->getTerminator()->getPrevNode()); + } + if (Insts.empty()) + Fail = true; + } + + bool isValid() const { return !Fail; } + ArrayRef<Instruction *> operator*() const { return Insts; } + SmallPtrSet<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; } + + void restrictToBlocks(SmallPtrSetImpl<BasicBlock *> &Blocks) { + for (auto II = Insts.begin(); II != Insts.end();) { + if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) == + Blocks.end()) { + ActiveBlocks.erase((*II)->getParent()); + II = Insts.erase(II); + } else { + ++II; + } + } + } + + void operator--() { + if (Fail) + return; + SmallVector<Instruction *, 4> NewInsts; + for (auto *Inst : Insts) { + if (Inst == &Inst->getParent()->front()) + ActiveBlocks.erase(Inst->getParent()); + else + NewInsts.push_back(Inst->getPrevNode()); + } + if (NewInsts.empty()) { + Fail = true; + return; + } + Insts = NewInsts; + } +}; + +//===----------------------------------------------------------------------===// + +/// Candidate solution for sinking. There may be different ways to +/// sink instructions, differing in the number of instructions sunk, +/// the number of predecessors sunk from and the number of PHIs +/// required. +struct SinkingInstructionCandidate { + unsigned NumBlocks; + unsigned NumInstructions; + unsigned NumPHIs; + unsigned NumMemoryInsts; + int Cost = -1; + SmallVector<BasicBlock *, 4> Blocks; + + void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) { + unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs; + unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0; + Cost = (NumInstructions * (NumBlocks - 1)) - + (NumExtraPHIs * + NumExtraPHIs) // PHIs are expensive, so make sure they're worth it. + - SplitEdgeCost; + } + bool operator>=(const SinkingInstructionCandidate &Other) const { + return Cost >= Other.Cost; + } +}; + +#ifndef NDEBUG +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const SinkingInstructionCandidate &C) { + OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks + << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">"; + return OS; +} +#endif + +//===----------------------------------------------------------------------===// + +/// Describes a PHI node that may or may not exist. These track the PHIs +/// that must be created if we sunk a sequence of instructions. It provides +/// a hash function for efficient equality comparisons. 
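calculateCost above weighs sunk instructions linearly but penalizes new PHIs quadratically, plus a flat charge of 2 when sinking from only a subset of predecessors forces an edge split. Worked numbers for sinking three instructions from both arms of a diamond, where one PHI already existed and a second must be created:

    // NumInstructions = 3, NumBlocks = 2, NumOrigBlocks = 2,
    // NumPHIs = 2, NumOrigPHIs = 1:
    //   NumExtraPHIs  = 2 - 1 = 1
    //   SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0 = 0
    //   Cost = 3 * (2 - 1) - 1 * 1 - 0 = 2   // positive: worth sinking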
+class ModelledPHI { + SmallVector<Value *, 4> Values; + SmallVector<BasicBlock *, 4> Blocks; + +public: + ModelledPHI() {} + ModelledPHI(const PHINode *PN) { + for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) + Blocks.push_back(PN->getIncomingBlock(I)); + std::sort(Blocks.begin(), Blocks.end()); + + // This assumes the PHI is already well-formed and there aren't conflicting + // incoming values for the same block. + for (auto *B : Blocks) + Values.push_back(PN->getIncomingValueForBlock(B)); + } + /// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI + /// without the same ID. + /// \note This is specifically for DenseMapInfo - do not use this! + static ModelledPHI createDummy(size_t ID) { + ModelledPHI M; + M.Values.push_back(reinterpret_cast<Value*>(ID)); + return M; + } + + /// Create a PHI from an array of incoming values and incoming blocks. + template <typename VArray, typename BArray> + ModelledPHI(const VArray &V, const BArray &B) { + std::copy(V.begin(), V.end(), std::back_inserter(Values)); + std::copy(B.begin(), B.end(), std::back_inserter(Blocks)); + } + + /// Create a PHI from [I[OpNum] for I in Insts]. + template <typename BArray> + ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) { + std::copy(B.begin(), B.end(), std::back_inserter(Blocks)); + for (auto *I : Insts) + Values.push_back(I->getOperand(OpNum)); + } + + /// Restrict the PHI's contents down to only \c NewBlocks. + /// \c NewBlocks must be a subset of \c this->Blocks. + void restrictToBlocks(const SmallPtrSetImpl<BasicBlock *> &NewBlocks) { + auto BI = Blocks.begin(); + auto VI = Values.begin(); + while (BI != Blocks.end()) { + assert(VI != Values.end()); + if (std::find(NewBlocks.begin(), NewBlocks.end(), *BI) == + NewBlocks.end()) { + BI = Blocks.erase(BI); + VI = Values.erase(VI); + } else { + ++BI; + ++VI; + } + } + assert(Blocks.size() == NewBlocks.size()); + } + + ArrayRef<Value *> getValues() const { return Values; } + + bool areAllIncomingValuesSame() const { + return all_of(Values, [&](Value *V) { return V == Values[0]; }); + } + bool areAllIncomingValuesSameType() const { + return all_of( + Values, [&](Value *V) { return V->getType() == Values[0]->getType(); }); + } + bool areAnyIncomingValuesConstant() const { + return any_of(Values, [&](Value *V) { return isa<Constant>(V); }); + } + // Hash functor + unsigned hash() const { + return (unsigned)hash_combine_range(Values.begin(), Values.end()); + } + bool operator==(const ModelledPHI &Other) const { + return Values == Other.Values && Blocks == Other.Blocks; + } +}; + +template <typename ModelledPHI> struct DenseMapInfo { + static inline ModelledPHI &getEmptyKey() { + static ModelledPHI Dummy = ModelledPHI::createDummy(0); + return Dummy; + } + static inline ModelledPHI &getTombstoneKey() { + static ModelledPHI Dummy = ModelledPHI::createDummy(1); + return Dummy; + } + static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); } + static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) { + return LHS == RHS; + } +}; + +typedef DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>> ModelledPHISet; + +//===----------------------------------------------------------------------===// +// ValueTable +//===----------------------------------------------------------------------===// +// This is a value number table where the value number is a function of the +// *uses* of a value, rather than its operands. 
Thus, if VN(A) == VN(B) we know
+// that the program would be equivalent if we replaced A with PHI(A, B).
+//===----------------------------------------------------------------------===//
+
+/// A GVN expression describing how an instruction is used. The operands
+/// field of BasicExpression is used to store uses, not operands.
+///
+/// This class also contains fields for discriminators used when determining
+/// equivalence of instructions with side effects.
+class InstructionUseExpr : public GVNExpression::BasicExpression {
+  unsigned MemoryUseOrder = -1;
+  bool Volatile = false;
+
+public:
+  InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R,
+                     BumpPtrAllocator &A)
+      : GVNExpression::BasicExpression(I->getNumUses()) {
+    allocateOperands(R, A);
+    setOpcode(I->getOpcode());
+    setType(I->getType());
+
+    for (auto &U : I->uses())
+      op_push_back(U.getUser());
+    std::sort(op_begin(), op_end());
+  }
+  void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
+  void setVolatile(bool V) { Volatile = V; }
+
+  virtual hash_code getHashValue() const {
+    return hash_combine(GVNExpression::BasicExpression::getHashValue(),
+                        MemoryUseOrder, Volatile);
+  }
+
+  template <typename Function> hash_code getHashValue(Function MapFn) {
+    hash_code H =
+        hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile);
+    for (auto *V : operands())
+      H = hash_combine(H, MapFn(V));
+    return H;
+  }
+};
+
+class ValueTable {
+  DenseMap<Value *, uint32_t> ValueNumbering;
+  DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering;
+  DenseMap<size_t, uint32_t> HashNumbering;
+  BumpPtrAllocator Allocator;
+  ArrayRecycler<Value *> Recycler;
+  uint32_t nextValueNumber;
+
+  /// Create an expression for I based on its opcode and its uses. If I
+  /// touches or reads memory, the expression is also based upon its memory
+  /// order - see \c getMemoryUseOrder().
+  InstructionUseExpr *createExpr(Instruction *I) {
+    InstructionUseExpr *E =
+        new (Allocator) InstructionUseExpr(I, Recycler, Allocator);
+    if (isMemoryInst(I))
+      E->setMemoryUseOrder(getMemoryUseOrder(I));
+
+    if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+      CmpInst::Predicate Predicate = C->getPredicate();
+      E->setOpcode((C->getOpcode() << 8) | Predicate);
+    }
+    return E;
+  }
+
+  /// Helper to compute the value number for a memory instruction
+  /// (LoadInst/StoreInst), including checking the memory ordering and
+  /// volatility.
+  template <class Inst> InstructionUseExpr *createMemoryExpr(Inst *I) {
+    if (isStrongerThanUnordered(I->getOrdering()) || I->isAtomic())
+      return nullptr;
+    InstructionUseExpr *E = createExpr(I);
+    E->setVolatile(I->isVolatile());
+    return E;
+  }
+
+public:
+  /// Returns the value number for the specified value, assigning
+  /// it a new number if it did not have one before.
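+  ///
+  /// A hedged sketch of the use-based numbering (the IR names here are
+  /// invented for illustration): if two sibling predecessors each contain
+  /// `%v = load i32, i32* %p` whose only user is a store that has already
+  /// received the same value number, and getMemoryUseOrder() agrees for
+  /// both loads, then both loads hash to equal expressions and receive
+  /// equal value numbers - even though their pointer operands differ,
+  /// since operands are deliberately not part of the expression.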
+  uint32_t lookupOrAdd(Value *V) {
+    auto VI = ValueNumbering.find(V);
+    if (VI != ValueNumbering.end())
+      return VI->second;
+
+    if (!isa<Instruction>(V)) {
+      ValueNumbering[V] = nextValueNumber;
+      return nextValueNumber++;
+    }
+
+    Instruction *I = cast<Instruction>(V);
+    InstructionUseExpr *exp = nullptr;
+    switch (I->getOpcode()) {
+    case Instruction::Load:
+      exp = createMemoryExpr(cast<LoadInst>(I));
+      break;
+    case Instruction::Store:
+      exp = createMemoryExpr(cast<StoreInst>(I));
+      break;
+    case Instruction::Call:
+    case Instruction::Invoke:
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::ICmp:
+    case Instruction::FCmp:
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+    case Instruction::Select:
+    case Instruction::ExtractElement:
+    case Instruction::InsertElement:
+    case Instruction::ShuffleVector:
+    case Instruction::InsertValue:
+    case Instruction::GetElementPtr:
+      exp = createExpr(I);
+      break;
+    default:
+      break;
+    }
+
+    if (!exp) {
+      ValueNumbering[V] = nextValueNumber;
+      return nextValueNumber++;
+    }
+
+    uint32_t e = ExpressionNumbering[exp];
+    if (!e) {
+      hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); });
+      auto I = HashNumbering.find(H);
+      if (I != HashNumbering.end()) {
+        e = I->second;
+      } else {
+        e = nextValueNumber++;
+        HashNumbering[H] = e;
+        ExpressionNumbering[exp] = e;
+      }
+    }
+    ValueNumbering[V] = e;
+    return e;
+  }
+
+  /// Returns the value number of the specified value. Fails if the value has
+  /// not yet been numbered.
+  uint32_t lookup(Value *V) const {
+    auto VI = ValueNumbering.find(V);
+    assert(VI != ValueNumbering.end() && "Value not numbered?");
+    return VI->second;
+  }
+
+  /// Removes all value numberings and resets the value table.
+  void clear() {
+    ValueNumbering.clear();
+    ExpressionNumbering.clear();
+    HashNumbering.clear();
+    Recycler.clear(Allocator);
+    nextValueNumber = 1;
+  }
+
+  ValueTable() : nextValueNumber(1) {}
+
+  /// \c Inst uses or touches memory. Return an ID describing the memory state
+  /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2),
+  /// the exact same memory operations happen after I1 and I2.
+  ///
+  /// This is a very hard problem in general, so we use domain-specific
+  /// knowledge that we only ever check for equivalence between blocks sharing
+  /// a single common immediate successor, and when determining if I1 == I2 we
+  /// will have already determined that next(I1) == next(I2). This inductive
+  /// property allows us to simply return the value number of the next
+  /// instruction that defines memory.
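+  ///
+  /// For instance (an invented sketch, not taken from the pass): if I1 and
+  /// I2 in two sibling predecessors are each followed only by a store and
+  /// then the terminator, getMemoryUseOrder(I1) and getMemoryUseOrder(I2)
+  /// are the value numbers of those stores, so the orders match exactly
+  /// when the two stores were already numbered equal.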
+ uint32_t getMemoryUseOrder(Instruction *Inst) { + auto *BB = Inst->getParent(); + for (auto I = std::next(Inst->getIterator()), E = BB->end(); + I != E && !I->isTerminator(); ++I) { + if (!isMemoryInst(&*I)) + continue; + if (isa<LoadInst>(&*I)) + continue; + CallInst *CI = dyn_cast<CallInst>(&*I); + if (CI && CI->onlyReadsMemory()) + continue; + InvokeInst *II = dyn_cast<InvokeInst>(&*I); + if (II && II->onlyReadsMemory()) + continue; + return lookupOrAdd(&*I); + } + return 0; + } +}; + +//===----------------------------------------------------------------------===// + +class GVNSink { +public: + GVNSink() : VN() {} + bool run(Function &F) { + DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n"); + + unsigned NumSunk = 0; + ReversePostOrderTraversal<Function*> RPOT(&F); + for (auto *N : RPOT) + NumSunk += sinkBB(N); + + return NumSunk > 0; + } + +private: + ValueTable VN; + + bool isInstructionBlacklisted(Instruction *I) { + // These instructions may change or break semantics if moved. + if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) || + I->getType()->isTokenTy()) + return true; + return false; + } + + /// The main heuristic function. Analyze the set of instructions pointed to by + /// LRI and return a candidate solution if these instructions can be sunk, or + /// None otherwise. + Optional<SinkingInstructionCandidate> analyzeInstructionForSinking( + LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum, + ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents); + + /// Create a ModelledPHI for each PHI in BB, adding to PHIs. + void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs, + SmallPtrSetImpl<Value *> &PHIContents) { + for (auto &I : *BB) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN) + return; + + auto MPHI = ModelledPHI(PN); + PHIs.insert(MPHI); + for (auto *V : MPHI.getValues()) + PHIContents.insert(V); + } + } + + /// The main instruction sinking driver. Set up state and try and sink + /// instructions into BBEnd from its predecessors. + unsigned sinkBB(BasicBlock *BBEnd); + + /// Perform the actual mechanics of sinking an instruction from Blocks into + /// BBEnd, which is their only successor. + void sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, BasicBlock *BBEnd); + + /// Remove PHIs that all have the same incoming value. 
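+  ///
+  /// For example (illustrative IR, names invented), `%p = phi i32
+  /// [ %x, %a ], [ %x, %b ]` is replaced by %x, and the self-referential
+  /// `%p = phi i32 [ %p, %a ], [ %p, %b ]` is replaced by undef.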
+  void foldPointlessPHINodes(BasicBlock *BB) {
+    auto I = BB->begin();
+    while (PHINode *PN = dyn_cast<PHINode>(I++)) {
+      if (!all_of(PN->incoming_values(),
+                  [&](const Value *V) { return V == PN->getIncomingValue(0); }))
+        continue;
+      if (PN->getIncomingValue(0) != PN)
+        PN->replaceAllUsesWith(PN->getIncomingValue(0));
+      else
+        PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+      PN->eraseFromParent();
+    }
+  }
+};
+
+Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
+    LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+    ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) {
+  auto Insts = *LRI;
+  DEBUG(dbgs() << " -- Analyzing instruction set: [\n";
+        for (auto *I : Insts) { I->dump(); }
+        dbgs() << " ]\n";);
+
+  DenseMap<uint32_t, unsigned> VNums;
+  for (auto *I : Insts) {
+    uint32_t N = VN.lookupOrAdd(I);
+    DEBUG(dbgs() << " VN=" << utohexstr(N) << " for" << *I << "\n");
+    if (N == ~0U)
+      return None;
+    VNums[N]++;
+  }
+  unsigned VNumToSink =
+      std::max_element(VNums.begin(), VNums.end(),
+                       [](const std::pair<uint32_t, unsigned> &I,
+                          const std::pair<uint32_t, unsigned> &J) {
+                         return I.second < J.second;
+                       })
+          ->first;
+
+  if (VNums[VNumToSink] == 1)
+    // Can't sink anything!
+    return None;
+
+  // Now restrict the number of incoming blocks down to only those with
+  // VNumToSink.
+  auto &ActivePreds = LRI.getActiveBlocks();
+  unsigned InitialActivePredSize = ActivePreds.size();
+  SmallVector<Instruction *, 4> NewInsts;
+  for (auto *I : Insts) {
+    if (VN.lookup(I) != VNumToSink)
+      ActivePreds.erase(I->getParent());
+    else
+      NewInsts.push_back(I);
+  }
+  for (auto *I : NewInsts)
+    if (isInstructionBlacklisted(I))
+      return None;
+
+  // If we've restricted the incoming blocks, restrict all needed PHIs also
+  // to that set.
+  bool RecomputePHIContents = false;
+  if (ActivePreds.size() != InitialActivePredSize) {
+    ModelledPHISet NewNeededPHIs;
+    for (auto P : NeededPHIs) {
+      P.restrictToBlocks(ActivePreds);
+      NewNeededPHIs.insert(P);
+    }
+    NeededPHIs = NewNeededPHIs;
+    LRI.restrictToBlocks(ActivePreds);
+    RecomputePHIContents = true;
+  }
+
+  // The sunk instruction's results.
+  ModelledPHI NewPHI(NewInsts, ActivePreds);
+
+  // Does sinking this instruction render previous PHIs redundant?
+  if (NeededPHIs.find(NewPHI) != NeededPHIs.end()) {
+    NeededPHIs.erase(NewPHI);
+    RecomputePHIContents = true;
+  }
+
+  if (RecomputePHIContents) {
+    // The needed PHIs have changed, so recompute the set of all needed
+    // values.
+    PHIContents.clear();
+    for (auto &PHI : NeededPHIs)
+      PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+  }
+
+  // Is this instruction required by a later PHI that doesn't match this PHI?
+  // If so, we can't sink this instruction.
+  for (auto *V : NewPHI.getValues())
+    if (PHIContents.count(V))
+      // V exists in this PHI, but the whole PHI is different to NewPHI
+      // (else it would have been removed earlier). We cannot continue
+      // because this isn't representable.
+      return None;
+
+  // Which operands need PHIs?
+  // FIXME: If any of these fail, we should partition up the candidates to
+  // try and continue making progress.
+  Instruction *I0 = NewInsts[0];
+  for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) {
+    ModelledPHI PHI(NewInsts, OpNum, ActivePreds);
+    if (PHI.areAllIncomingValuesSame())
+      continue;
+    if (!canReplaceOperandWithVariable(I0, OpNum))
+      // We can't create a PHI from this instruction!
+      return None;
+    if (NeededPHIs.count(PHI))
+      continue;
+    if (!PHI.areAllIncomingValuesSameType())
+      return None;
+    // Don't create indirect calls! The called value is the final operand.
+    if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OpNum == E - 1 &&
+        PHI.areAnyIncomingValuesConstant())
+      return None;
+
+    NeededPHIs.reserve(NeededPHIs.size());
+    NeededPHIs.insert(PHI);
+    PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+  }
+
+  if (isMemoryInst(NewInsts[0]))
+    ++MemoryInstNum;
+
+  SinkingInstructionCandidate Cand;
+  Cand.NumInstructions = ++InstNum;
+  Cand.NumMemoryInsts = MemoryInstNum;
+  Cand.NumBlocks = ActivePreds.size();
+  Cand.NumPHIs = NeededPHIs.size();
+  for (auto *C : ActivePreds)
+    Cand.Blocks.push_back(C);
+
+  return Cand;
+}
+
+unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
+  DEBUG(dbgs() << "GVNSink: running on basic block ";
+        BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+  SmallVector<BasicBlock *, 4> Preds;
+  for (auto *B : predecessors(BBEnd)) {
+    auto *T = B->getTerminator();
+    if (isa<BranchInst>(T) || isa<SwitchInst>(T))
+      Preds.push_back(B);
+    else
+      return 0;
+  }
+  if (Preds.size() < 2)
+    return 0;
+  std::sort(Preds.begin(), Preds.end());
+
+  unsigned NumOrigPreds = Preds.size();
+  // We can only sink instructions through unconditional branches.
+  for (auto I = Preds.begin(); I != Preds.end();) {
+    if ((*I)->getTerminator()->getNumSuccessors() != 1)
+      I = Preds.erase(I);
+    else
+      ++I;
+  }
+
+  LockstepReverseIterator LRI(Preds);
+  SmallVector<SinkingInstructionCandidate, 4> Candidates;
+  unsigned InstNum = 0, MemoryInstNum = 0;
+  ModelledPHISet NeededPHIs;
+  SmallPtrSet<Value *, 4> PHIContents;
+  analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents);
+  unsigned NumOrigPHIs = NeededPHIs.size();
+
+  while (LRI.isValid()) {
+    auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum,
+                                             NeededPHIs, PHIContents);
+    if (!Cand)
+      break;
+    Cand->calculateCost(NumOrigPHIs, Preds.size());
+    Candidates.emplace_back(*Cand);
+    --LRI;
+  }
+
+  std::stable_sort(
+      Candidates.begin(), Candidates.end(),
+      [](const SinkingInstructionCandidate &A,
+         const SinkingInstructionCandidate &B) { return A >= B; });
+  DEBUG(dbgs() << " -- Sinking candidates:\n";
+        for (auto &C : Candidates) dbgs() << "  " << C << "\n";);
+
+  // Pick the top candidate, as long as it is positive!
+  if (Candidates.empty() || Candidates.front().Cost <= 0)
+    return 0;
+  auto C = Candidates.front();
+
+  DEBUG(dbgs() << " -- Sinking: " << C << "\n");
+  BasicBlock *InsertBB = BBEnd;
+  if (C.Blocks.size() < NumOrigPreds) {
+    DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs());
+          dbgs() << "\n");
+    InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
+    if (!InsertBB) {
+      DEBUG(dbgs() << " -- FAILED to split edge!\n");
+      // Edge couldn't be split.
+ return 0; + } + } + + for (unsigned I = 0; I < C.NumInstructions; ++I) + sinkLastInstruction(C.Blocks, InsertBB); + + return C.NumInstructions; +} + +void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, + BasicBlock *BBEnd) { + SmallVector<Instruction *, 4> Insts; + for (BasicBlock *BB : Blocks) + Insts.push_back(BB->getTerminator()->getPrevNode()); + Instruction *I0 = Insts.front(); + + SmallVector<Value *, 4> NewOperands; + for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) { + bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) { + return I->getOperand(O) != I0->getOperand(O); + }); + if (!NeedPHI) { + NewOperands.push_back(I0->getOperand(O)); + continue; + } + + // Create a new PHI in the successor block and populate it. + auto *Op = I0->getOperand(O); + assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!"); + auto *PN = PHINode::Create(Op->getType(), Insts.size(), + Op->getName() + ".sink", &BBEnd->front()); + for (auto *I : Insts) + PN->addIncoming(I->getOperand(O), I->getParent()); + NewOperands.push_back(PN); + } + + // Arbitrarily use I0 as the new "common" instruction; remap its operands + // and move it to the start of the successor block. + for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) + I0->getOperandUse(O).set(NewOperands[O]); + I0->moveBefore(&*BBEnd->getFirstInsertionPt()); + + // Update metadata and IR flags. + for (auto *I : Insts) + if (I != I0) { + combineMetadataForCSE(I0, I); + I0->andIRFlags(I); + } + + for (auto *I : Insts) + if (I != I0) + I->replaceAllUsesWith(I0); + foldPointlessPHINodes(BBEnd); + + // Finally nuke all instructions apart from the common instruction. + for (auto *I : Insts) + if (I != I0) + I->eraseFromParent(); + + NumRemoved += Insts.size() - 1; +} + +//////////////////////////////////////////////////////////////////////////////// +// Pass machinery / boilerplate + +class GVNSinkLegacyPass : public FunctionPass { +public: + static char ID; + + GVNSinkLegacyPass() : FunctionPass(ID) { + initializeGVNSinkLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + GVNSink G; + return G.run(F); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; +} // namespace + +PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) { + GVNSink G; + if (!G.run(F)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve<GlobalsAA>(); + return PA; +} + +char GVNSinkLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink", + "Early GVN sinking of Expressions", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink", + "Early GVN sinking of Expressions", false, false) + +FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); } diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp index 198d2b2b024f..65a2cd955672 100644 --- a/lib/Transforms/Scalar/GuardWidening.cpp +++ b/lib/Transforms/Scalar/GuardWidening.cpp @@ -537,9 +537,7 @@ bool GuardWideningImpl::parseRangeChecks( Changed = true; } else if (match(Check.getBase(), m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) { - unsigned BitWidth = OpLHS->getType()->getScalarSizeInBits(); - KnownBits Known(BitWidth); - computeKnownBits(OpLHS, Known, DL); + KnownBits Known = computeKnownBits(OpLHS, DL); 
 if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) {
 Check.setBase(OpLHS);
 APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 85db6e5e1105..e21b0feb7c5a 100644
--- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -1228,7 +1228,12 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
 Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
                                                  ValueToValueMapTy &VM) {
-  Loop &New = LPM.addLoop(Parent);
+  Loop &New = *new Loop();
+  if (Parent)
+    Parent->addChildLoop(&New);
+  else
+    LI.addTopLevelLoop(&New);
+  LPM.addLoop(New);
 
   // Add all of the blocks in Original to the new loop.
   for (auto *BB : Original->blocks())
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index ada22ae38eb8..2ef8f8563bb9 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -253,6 +253,35 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
   return EverChanged;
 }
 
+// Replace uses of Cond with ToVal when safe to do so. If all uses are
+// replaced, we can remove Cond. We cannot blindly replace all uses of Cond
+// because we may incorrectly replace uses when guards/assumes are uses of
+// `Cond` and we use the guards/assumes to reason about the `Cond` value
+// at the end of the block. RAUW unconditionally replaces all uses
+// including the guards/assumes themselves and the uses before the
+// guard/assume.
+static void ReplaceFoldableUses(Instruction *Cond, Value *ToVal) {
+  assert(Cond->getType() == ToVal->getType());
+  auto *BB = Cond->getParent();
+  // We can unconditionally replace all uses in non-local blocks (i.e. uses
+  // strictly dominated by BB), since LVI information is true from the
+  // terminator of BB.
+  replaceNonLocalUsesWith(Cond, ToVal);
+  for (Instruction &I : reverse(*BB)) {
+    // Reached the Cond whose uses we are trying to replace, so there are no
+    // more uses.
+    if (&I == Cond)
+      break;
+    // We only replace uses in instructions that are guaranteed to reach the
+    // end of BB, where we know Cond is ToVal.
+    if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+      break;
+    I.replaceUsesOfWith(Cond, ToVal);
+  }
+  if (Cond->use_empty() && !Cond->mayHaveSideEffects())
+    Cond->eraseFromParent();
+}
+
 /// Return the cost of duplicating a piece of this block from first non-phi
 /// and before StopAt instruction to thread across it. Stop scanning the block
 /// when exceeding the threshold. If duplication is impossible, returns ~0U.
@@ -833,13 +862,19 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
       CondBr->eraseFromParent();
       if (CondCmp->use_empty())
        CondCmp->eraseFromParent();
-      // TODO: We can safely replace *some* uses of the CondInst if it has
+      // We can safely replace *some* uses of the CondInst if it has
      // exactly one value as returned by LVI. RAUW is incorrect in the
      // presence of guards and assumes that have `Cond` as a use. This
      // is because we use the guards/assumes to reason about the `Cond` value
      // at the end of the block, but RAUW unconditionally replaces all uses
      // including the guards/assumes themselves and the uses before the
      // guard/assume.
+      else if (CondCmp->getParent() == BB) {
+        auto *CI = Ret == LazyValueInfo::True ?
+                   ConstantInt::getTrue(CondCmp->getType()) :
+                   ConstantInt::getFalse(CondCmp->getType());
+        ReplaceFoldableUses(CondCmp, CI);
+      }
       return true;
     }
 
@@ -1325,13 +1360,16 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
     if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
       if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
         CondInst->eraseFromParent();
-      // TODO: We can safely replace *some* uses of the CondInst if it has
+      // We can safely replace *some* uses of the CondInst if it has
      // exactly one value as returned by LVI. RAUW is incorrect in the
      // presence of guards and assumes that have `Cond` as a use. This
      // is because we use the guards/assumes to reason about the `Cond` value
      // at the end of the block, but RAUW unconditionally replaces all uses
      // including the guards/assumes themselves and the uses before the
      // guard/assume.
+      else if (OnlyVal && OnlyVal != MultipleVal &&
+               CondInst->getParent() == BB)
+        ReplaceFoldableUses(CondInst, OnlyVal);
     }
     return true;
   }
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 97337ea5ba62..c6a05ecbd0b1 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1035,6 +1035,17 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
   return nullptr;
 }
 
+// Check if the recurrence variable `VarX` is in the right form to create
+// the idiom. Returns the value coerced to a PHINode if so.
+static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
+                                 BasicBlock *LoopEntry) {
+  auto *PhiX = dyn_cast<PHINode>(VarX);
+  if (PhiX && PhiX->getParent() == LoopEntry &&
+      (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
+    return PhiX;
+  return nullptr;
+}
+
 /// Return true iff the idiom is detected in the loop.
 ///
 /// Additionally:
@@ -1110,13 +1121,9 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
   }
 
   // step 3: Check the recurrence of variable X
-  {
-    PhiX = dyn_cast<PHINode>(VarX1);
-    if (!PhiX ||
-        (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
-      return false;
-    }
-  }
+  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
+  if (!PhiX)
+    return false;
 
   // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
   {
@@ -1132,8 +1139,8 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
       if (!Inc || !Inc->isOne())
         continue;
 
-      PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
-      if (!Phi || Phi->getParent() != LoopEntry)
+      PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+      if (!Phi)
         continue;
 
       // Check if the result of the instruction is live out of the loop.
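For context, the recurrence these hunks match is the classic population-count
idiom; the following is an illustrative C sketch (function and variable names
invented) of the kind of source loop LoopIdiomRecognize rewrites to
@llvm.ctpop:

    int popcount_idiom(unsigned x) {
      int cnt = 0;
      while (x) {
        x = x & (x - 1); // the DefX2 recurrence: clears the lowest set bit
        cnt++;           // the cnt2 = cnt1 + 1 recurrence matched in step 4
      }
      return cnt;
    }

getRecurrenceVar checks that x's loop value is a PHI in the loop header that
is fed back by the `x & (x - 1)` definition.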
@@ -1227,8 +1234,8 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
   VarX = DefX->getOperand(0);
 
   // step 3: Check the recurrence of variable X
-  PhiX = dyn_cast<PHINode>(VarX);
-  if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX))
+  PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
+  if (!PhiX)
     return false;
 
   // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
@@ -1248,8 +1255,8 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
     if (!Inc || !Inc->isOne())
       continue;
 
-    PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
-    if (!Phi || Phi->getParent() != LoopEntry)
+    PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+    if (!Phi)
       continue;
 
     CntInst = Inst;
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 6ef1464e9338..19daebd0613a 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -831,7 +831,12 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
 /// mapping the blocks with the specified map.
 static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI,
                        LPPassManager *LPM) {
-  Loop &New = LPM->addLoop(PL);
+  Loop &New = *new Loop();
+  if (PL)
+    PL->addChildLoop(&New);
+  else
+    LI->addTopLevelLoop(&New);
+  LPM->addLoop(New);
 
   // Add all of the blocks in L to the new loop.
   for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 5cfbf6baeaa9..67abc3116988 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -858,7 +858,14 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
 
   // Filter out unreachable phi operands.
   auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) {
-    return ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock});
+    if (*U == PN)
+      return false;
+    if (!ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock}))
+      return false;
+    // Things in TOPClass are equivalent to everything.
+    if (ValueToClass.lookup(*U) == TOPClass)
+      return false;
+    return true;
   });
   std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
                  [&](const Use *U) -> Value * {
@@ -866,14 +873,6 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
                    HasBackedge = HasBackedge || isBackedge(BB, PHIBlock);
                    OriginalOpsConstant =
                        OriginalOpsConstant && isa<Constant>(*U);
-                   // Use nullptr to distinguish between things that were
-                   // originally self-defined and those that have an operand
-                   // leader that is self-defined.
-                   if (*U == PN)
-                     return nullptr;
-                   // Things in TOPClass are equivalent to everything.
-                   if (ValueToClass.lookup(*U) == TOPClass)
-                     return nullptr;
                    return lookupOperandLeader(*U);
                  });
   return E;
@@ -955,6 +954,10 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
 
   CongruenceClass *CC = ValueToClass.lookup(V);
   if (CC && CC->getDefiningExpr()) {
+    // If we simplified to something else, we need to communicate
+    // that we're users of the value we simplified to.
+    if (I != V)
+      addAdditionalUsers(V, I);
     if (I)
       DEBUG(dbgs() << "Simplified " << *I << " to "
                    << " expression " << *CC->getDefiningExpr() << "\n");
@@ -1581,6 +1584,30 @@ bool NewGVN::isCycleFree(const Instruction *I) const {
 
 // Evaluate PHI nodes symbolically, and create an expression result.
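// As an illustrative sketch of the cycle resolution added below (IR names
// invented): for the mutually recursive phis
//   %p1 = phi i32 [ %x, %entry ], [ %p2, %latch ]
//   %p2 = phi i32 [ %p1, %header ]
// the strongly connected component is {%p1, %p2} and the only operand coming
// from outside it is %x, so both phis are valued as %x.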
const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
+  // Resolve irreducible and reducible phi cycles.
+  // FIXME: This is hopefully a temporary solution while we resolve the issues
+  // with fixpointing self-cycles. It currently should be "guaranteed" to be
+  // correct, but non-optimal. The SCCFinder does not, for example, take
+  // reachability of arguments into account, etc.
+  SCCFinder.Start(I);
+  bool CanOptimize = true;
+  SmallPtrSet<Value *, 8> OuterOps;
+
+  auto &Component = SCCFinder.getComponentFor(I);
+  for (auto *Member : Component) {
+    if (!isa<PHINode>(Member)) {
+      CanOptimize = false;
+      break;
+    }
+    for (auto &PHIOp : cast<PHINode>(Member)->operands())
+      if (!isa<PHINode>(PHIOp) || !Component.count(cast<PHINode>(PHIOp)))
+        OuterOps.insert(PHIOp);
+  }
+  if (CanOptimize && OuterOps.size() == 1) {
+    DEBUG(dbgs() << "Resolving cyclic phi to value " << *(*OuterOps.begin())
+                 << "\n");
+    return createVariableOrConstant(*OuterOps.begin());
+  }
   // True if one of the incoming phi edges is a backedge.
   bool HasBackedge = false;
   // All constant tracks the state of whether all the *original* phi operands
@@ -1594,17 +1621,7 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
   // See if all arguments are the same.
   // We track if any were undef because they need special handling.
   bool HasUndef = false;
-  bool CycleFree = isCycleFree(I);
   auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
-    if (Arg == nullptr)
-      return false;
-    // Original self-operands are already eliminated during expression creation.
-    // We can only eliminate value-wise self-operands if it's cycle
-    // free. Otherwise, eliminating the operand can cause our value to change,
-    // which can cause us to not eliminate the operand, which changes the value
-    // back to what it was before, cycling forever.
-    if (CycleFree && Arg == I)
-      return false;
     if (isa<UndefValue>(Arg)) {
      HasUndef = true;
      return false;
@@ -1613,6 +1630,14 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
   });
   // If we are left with no operands, it's dead.
   if (Filtered.begin() == Filtered.end()) {
+    // If it has undef at this point, it means there are no non-undef
+    // arguments, and thus the value of the phi node must be undef.
+    if (HasUndef) {
+      DEBUG(dbgs() << "PHI Node " << *I
+                   << " has no non-undef arguments, valuing it as undef\n");
+      return createConstantExpression(UndefValue::get(I->getType()));
+    }
+
     DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
     deleteExpression(E);
     return createDeadExpression();
@@ -1642,7 +1667,7 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
   // constants, or all operands are ignored but the undef, it also must be
   // cycle free.
   if (!AllConstant && HasBackedge && NumOps > 0 &&
-      !isa<UndefValue>(AllSameValue) && !CycleFree)
+      !isa<UndefValue>(AllSameValue) && !isCycleFree(I))
     return E;
 
   // Only have to check for instructions
@@ -3556,6 +3581,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
   // Map to store the use counts
   DenseMap<const Value *, unsigned int> UseCounts;
   for (auto *CC : reverse(CongruenceClasses)) {
+    DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() << "\n");
     // Track the equivalent store info so we can decide whether to try
     // dead store elimination.
SmallVector<ValueDFS, 8> PossibleDeadStores; @@ -3602,8 +3628,6 @@ bool NewGVN::eliminateInstructions(Function &F) { } CC->swap(MembersLeft); } else { - DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() - << "\n"); // If this is a singleton, we can skip it. if (CC->size() != 1 || RealToTemp.lookup(Leader)) { // This is a stack because equality replacement/etc may place @@ -3846,6 +3870,7 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B); } +namespace { class NewGVNLegacyPass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. @@ -3865,6 +3890,7 @@ private: AU.addPreserved<GlobalsAAWrapperPass>(); } }; +} // namespace bool NewGVNLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 8908dae2f545..1d0e8396f6a2 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -1779,8 +1779,9 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, // arguments and return value aggressively, and can assume it is not called // unless we see evidence to the contrary. if (F.hasLocalLinkage()) { - if (AddressIsTaken(&F)) + if (F.hasAddressTaken()) { AddressTakenFunctions.insert(&F); + } else { Solver.AddArgumentTrackedFunction(&F); continue; diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 24bd0a2b7bdf..6e113bccff94 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -326,7 +326,7 @@ private: /// partition. uint64_t BeginOffset, EndOffset; - /// \brief The start end end iterators of this partition. + /// \brief The start and end iterators of this partition. iterator SI, SJ; /// \brief A collection of split slice tails overlapping the partition. diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 52201d8f3e51..9fa43da99da9 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -48,6 +48,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeEarlyCSELegacyPassPass(Registry); initializeEarlyCSEMemSSALegacyPassPass(Registry); initializeGVNHoistLegacyPassPass(Registry); + initializeGVNSinkLegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); initializeInductiveRangeCheckEliminationPass(Registry); initializeIndVarSimplifyLegacyPassPass(Registry); diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index b32a61a7e8f8..0f170e26ce5f 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -123,11 +123,62 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, // exit block. DT.changeImmediateDominator(UnswitchedNode, OldPHNode); - // Blocks reachable from the unswitched block may need to change their IDom - // as well. + // For everything that moves up the dominator tree, we need to examine the + // dominator frontier to see if it additionally should move up the dominator + // tree. This lambda appends the dominator frontier for a node on the + // worklist. + // + // Note that we don't currently use the IDFCalculator here for two reasons: + // 1) It computes dominator tree levels for the entire function on each run + // of 'compute'. 
While this isn't terrible, given that we expect to update
+  // relatively small subtrees of the domtree, it isn't necessarily the right
+  // tradeoff.
+  // 2) The interface doesn't fit this usage well. It doesn't operate in an
+  //    append-only fashion, and it builds several sets that we don't need.
+  //
+  // FIXME: Neither of these issues is a big deal and could be addressed with
+  // some amount of refactoring of IDFCalculator. That would allow us to share
+  // the core logic here (which is solving the same core problem).
   SmallSetVector<BasicBlock *, 4> Worklist;
-  for (auto *SuccBB : successors(UnswitchedBB))
-    Worklist.insert(SuccBB);
+  SmallVector<DomTreeNode *, 4> DomNodes;
+  SmallPtrSet<BasicBlock *, 4> DomSet;
+  auto AppendDomFrontier = [&](DomTreeNode *Node) {
+    assert(DomNodes.empty() && "Must start with no dominator nodes.");
+    assert(DomSet.empty() && "Must start with an empty dominator set.");
+
+    // First flatten this subtree into a sequence of nodes by doing a pre-order
+    // walk.
+    DomNodes.push_back(Node);
+    // We intentionally re-evaluate the size as each node can add new children.
+    // Because this is a tree walk, this cannot add any duplicates.
+    for (int i = 0; i < (int)DomNodes.size(); ++i)
+      DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end());
+
+    // Now create a set of the basic blocks so we can quickly test for
+    // dominated successors. We could in theory use the DFS numbers of the
+    // dominator tree for this, but we want this to remain predictably fast
+    // even while we mutate the dominator tree in ways that would invalidate
+    // the DFS numbering.
+    for (DomTreeNode *InnerN : DomNodes)
+      DomSet.insert(InnerN->getBlock());
+
+    // Now re-walk the nodes, appending every successor of every node that
+    // isn't in the set. Note that we don't append the node itself, even
+    // though, if it is a successor, it does not strictly dominate itself and
+    // thus would be part of the dominance frontier. The reason we don't
+    // append it is that the node passed in came *from* the worklist and so
+    // it has already been processed.
+    for (DomTreeNode *InnerN : DomNodes)
+      for (BasicBlock *SuccBB : successors(InnerN->getBlock()))
+        if (!DomSet.count(SuccBB))
+          Worklist.insert(SuccBB);
+
+    DomNodes.clear();
+    DomSet.clear();
+  };
+
+  // Append the initial dom frontier nodes.
+  AppendDomFrontier(UnswitchedNode);
 
   // Walk the worklist. We grow the list in the loop and so must recompute size.
   for (int i = 0; i < (int)Worklist.size(); ++i) {
@@ -136,20 +187,17 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
     DomTreeNode *Node = DT[BB];
     assert(!DomChain.count(Node) &&
            "Cannot be dominated by a block you can reach!");
-    // If this block doesn't have an immediate dominator somewhere in the chain
-    // we hoisted over, then its position in the domtree hasn't changed. Either
-    // it is above the region hoisted and still valid, or it is below the
-    // hoisted block and so was trivially updated. This also applies to
-    // everything reachable from this block so we're completely done with the
-    // it.
+
+    // If this block had an immediate dominator somewhere in the chain
+    // we hoisted over, then its position in the domtree needs to move as it is
+    // reachable from a node hoisted over this chain.
     if (!DomChain.count(Node->getIDom()))
       continue;
-    // We need to change the IDom for this node but also walk its successors
-    // which could have similar dominance position.
DT.changeImmediateDominator(Node, OldPHNode); - for (auto *SuccBB : successors(BB)) - Worklist.insert(SuccBB); + + // Now add this node's dominator frontier to the worklist as well. + AppendDomFrontier(Node); } } diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index bf2ab7c55be2..1ec3d0d49637 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -133,7 +133,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, auto *SP = cast<DISubprogram>(MD.second); NewMD = DISubprogram::getDistinct( NewFunc->getContext(), SP->getScope(), SP->getName(), - NewFunc->getName(), SP->getFile(), SP->getLine(), SP->getType(), + SP->getLinkageName(), SP->getFile(), SP->getLine(), SP->getType(), SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(), SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(), SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(), diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp index 73a0b2737e95..57468be9a2a8 100644 --- a/lib/Transforms/Utils/FunctionComparator.cpp +++ b/lib/Transforms/Utils/FunctionComparator.cpp @@ -76,12 +76,14 @@ int FunctionComparator::cmpMem(StringRef L, StringRef R) const { int FunctionComparator::cmpAttrs(const AttributeList L, const AttributeList R) const { - if (int Res = cmpNumbers(L.getNumSlots(), R.getNumSlots())) + if (int Res = cmpNumbers(L.getNumAttrSets(), R.getNumAttrSets())) return Res; - for (unsigned i = 0, e = L.getNumSlots(); i != e; ++i) { - AttributeList::iterator LI = L.begin(i), LE = L.end(i), RI = R.begin(i), - RE = R.end(i); + for (unsigned i = L.index_begin(), e = L.index_end(); i != e; ++i) { + AttributeSet LAS = L.getAttributes(i); + AttributeSet RAS = R.getAttributes(i); + AttributeSet::iterator LI = LAS.begin(), LE = LAS.end(); + AttributeSet::iterator RI = RAS.begin(), RE = RAS.end(); for (; LI != LE && RI != RE; ++LI, ++RI) { Attribute LA = *LI; Attribute RA = *RI; diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 9cb4762b683c..0ca9f4c484e6 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -1397,11 +1397,12 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock, static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap, const Optional<uint64_t> &CalleeEntryCount, const Instruction *TheCall, - ProfileSummaryInfo *PSI) { + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *CallerBFI) { if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1) return; Optional<uint64_t> CallSiteCount = - PSI ? PSI->getProfileCount(TheCall, nullptr) : None; + PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None; uint64_t CallCount = std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0, CalleeEntryCount.getValue()); @@ -1637,7 +1638,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, CalledFunc->front()); updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall, - IFI.PSI); + IFI.PSI, IFI.CallerBFI); // Update the profile count of callee. 
   updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI);
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 1ca509472b5f..ebd528bc8ec1 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1037,17 +1037,15 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
                                           const DominatorTree *DT) {
   assert(V->getType()->isPointerTy() &&
          "getOrEnforceKnownAlignment expects a pointer!");
-  unsigned BitWidth = DL.getPointerTypeSizeInBits(V->getType());
-  KnownBits Known(BitWidth);
-  computeKnownBits(V, Known, DL, 0, AC, CxtI, DT);
+  KnownBits Known = computeKnownBits(V, DL, 0, AC, CxtI, DT);
   unsigned TrailZ = Known.countMinTrailingZeros();
 
   // Avoid trouble with ridiculously large TrailZ values, such as
   // those computed from a null pointer.
   TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1));
 
-  unsigned Align = 1u << std::min(BitWidth - 1, TrailZ);
+  unsigned Align = 1u << std::min(Known.getBitWidth() - 1, TrailZ);
 
   // LLVM doesn't support alignments larger than this currently.
   Align = std::min(Align, +Value::MaximumAlignment);
@@ -1796,6 +1794,23 @@ static unsigned replaceDominatedUsesWith(Value *From, Value *To,
   return Count;
 }
 
+unsigned llvm::replaceNonLocalUsesWith(Instruction *From, Value *To) {
+  assert(From->getType() == To->getType());
+  auto *BB = From->getParent();
+  unsigned Count = 0;
+
+  for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
+       UI != UE;) {
+    Use &U = *UI++;
+    auto *I = cast<Instruction>(U.getUser());
+    if (I->getParent() == BB)
+      continue;
+    U.set(To);
+    ++Count;
+  }
+  return Count;
+}
+
 unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
                                         DominatorTree &DT,
                                         const BasicBlockEdge &Root) {
@@ -2094,3 +2109,48 @@ void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
         !F->doesNotAccessMemory())
       CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin);
 }
+
+bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
+  // We can't have a PHI with a metadata type.
+  if (I->getOperand(OpIdx)->getType()->isMetadataTy())
+    return false;
+
+  // Early exit.
+  if (!isa<Constant>(I->getOperand(OpIdx)))
+    return true;
+
+  switch (I->getOpcode()) {
+  default:
+    return true;
+  case Instruction::Call:
+  case Instruction::Invoke:
+    // Many arithmetic intrinsics have no issue taking a variable; however,
+    // it's hard to distinguish these from specials such as @llvm.frameaddress
+    // that require a constant.
+    if (isa<IntrinsicInst>(I))
+      return false;
+
+    // Constant bundle operands may need to retain their constant-ness for
+    // correctness.
+    if (ImmutableCallSite(I).isBundleOperand(OpIdx))
+      return false;
+    return true;
+  case Instruction::ShuffleVector:
+    // Shufflevector masks are constant.
+    return OpIdx != 2;
+  case Instruction::ExtractValue:
+  case Instruction::InsertValue:
+    // All operands apart from the first are constant.
+ return OpIdx == 0; + case Instruction::Alloca: + return false; + case Instruction::GetElementPtr: + if (OpIdx == 0) + return true; + gep_type_iterator It = gep_type_begin(I); + for (auto E = std::next(It, OpIdx); It != E; ++It) + if (It.isStruct()) + return false; + return true; + } +} diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 27f72fcd8bda..1b442a9a264d 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1376,53 +1376,6 @@ HoistTerminator: return true; } -// Is it legal to place a variable in operand \c OpIdx of \c I? -// FIXME: This should be promoted to Instruction. -static bool canReplaceOperandWithVariable(const Instruction *I, - unsigned OpIdx) { - // We can't have a PHI with a metadata type. - if (I->getOperand(OpIdx)->getType()->isMetadataTy()) - return false; - - // Early exit. - if (!isa<Constant>(I->getOperand(OpIdx))) - return true; - - switch (I->getOpcode()) { - default: - return true; - case Instruction::Call: - case Instruction::Invoke: - // FIXME: many arithmetic intrinsics have no issue taking a - // variable, however it's hard to distingish these from - // specials such as @llvm.frameaddress that require a constant. - if (isa<IntrinsicInst>(I)) - return false; - - // Constant bundle operands may need to retain their constant-ness for - // correctness. - if (ImmutableCallSite(I).isBundleOperand(OpIdx)) - return false; - - return true; - - case Instruction::ShuffleVector: - // Shufflevector masks are constant. - return OpIdx != 2; - case Instruction::ExtractValue: - case Instruction::InsertValue: - // All operands apart from the first are constant. - return OpIdx == 0; - case Instruction::Alloca: - return false; - case Instruction::GetElementPtr: - if (OpIdx == 0) - return true; - gep_type_iterator It = std::next(gep_type_begin(I), OpIdx - 1); - return It.isSequential(); - } -} - // All instructions in Insts belong to different blocks that all unconditionally // branch to a common successor. 
Analyze each instruction and return true if it // would be possible to sink them into their successor, creating one common @@ -4368,8 +4321,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, const DataLayout &DL) { Value *Cond = SI->getCondition(); unsigned Bits = Cond->getType()->getIntegerBitWidth(); - KnownBits Known(Bits); - computeKnownBits(Cond, Known, DL, 0, AC, SI); + KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 85c9464b5569..49effda5d833 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -466,9 +466,7 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B, } Value *Offset = GEP->getOperand(2); - unsigned BitWidth = Offset->getType()->getIntegerBitWidth(); - KnownBits Known(BitWidth); - computeKnownBits(Offset, Known, DL, 0, nullptr, CI, nullptr); + KnownBits Known = computeKnownBits(Offset, DL, 0, nullptr, CI, nullptr); Known.Zero.flipAllBits(); uint64_t ArrSize = cast<ArrayType>(GEP->getSourceElementType())->getNumElements(); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 1dc554bede7e..3b036a6ac430 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2092,6 +2092,10 @@ private: /// The data is collected per VF. DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; + /// Holds the instructions (address computations) that are forced to be + /// scalarized. + DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; + /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to /// scalarize and their scalar costs are collected in \p ScalarCosts. A @@ -5086,12 +5090,18 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { } bool LoopVectorizationLegality::canVectorize() { + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; // We must have a loop in canonical form. Loops with indirectbr in them cannot // be canonicalized. if (!TheLoop->getLoopPreheader()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // FIXME: The code is currently dead, since the loop gets sent to @@ -5101,21 +5111,30 @@ bool LoopVectorizationLegality::canVectorize() { if (!TheLoop->empty()) { ORE->emit(createMissedAnalysis("NotInnermostLoop") << "loop is not the innermost loop"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // We must have a single backedge. if (TheLoop->getNumBackEdges() != 1) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // We must have a single exiting block. 
if (!TheLoop->getExitingBlock()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // We only handle bottom-tested loops, i.e. loop in which the condition is @@ -5124,7 +5143,10 @@ bool LoopVectorizationLegality::canVectorize() { if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // We need to have a loop header. @@ -5135,28 +5157,28 @@ bool LoopVectorizationLegality::canVectorize() { unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); - return false; - } - - // ScalarEvolution needs to be able to find the exit count. - const SCEV *ExitCount = PSE.getBackedgeTakenCount(); - if (ExitCount == PSE.getSE()->getCouldNotCompute()) { - ORE->emit(createMissedAnalysis("CantComputeNumberOfIterations") - << "could not determine number of loop iterations"); - DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // Check if we can vectorize the instructions and CFG in this loop. if (!canVectorizeInstrs()) { DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // Go over each instruction and look at memory deps. if (!canVectorizeMemory()) { DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } DEBUG(dbgs() << "LV: We can vectorize this loop" @@ -5184,13 +5206,17 @@ bool LoopVectorizationLegality::canVectorize() { << "Too many SCEV assumptions need to be made and checked " << "at runtime"); DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } - // Okay! We can vectorize. At this point we don't have any other mem analysis + // Okay! We've done all the tests. If any have failed, return false. Otherwise + // we can vectorize, and at this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with // no restrictions. - return true; + return Result; } static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { @@ -5554,6 +5580,13 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); } + // Insert the forced scalars. + // FIXME: Currently widenPHIInstruction() often creates a dead vector + // induction variable when the PHI user is scalarized. + if (ForcedScalars.count(VF)) + for (auto *I : ForcedScalars.find(VF)->second) + Worklist.insert(I); + // Expand the worklist by looking through any bitcasts and getelementptr // instructions we've already identified as scalar. 
This is similar to the
+  // expansion step in collectLoopUniforms(); however, here we're only
@@ -7129,11 +7162,18 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   if (VF > 1 && isProfitableToScalarize(I, VF))
     return VectorizationCostTy(InstsToScalarize[VF][I], false);
 
+  // Forced scalars do not have any scalarization overhead.
+  if (VF > 1 && ForcedScalars.count(VF) &&
+      ForcedScalars.find(VF)->second.count(I))
+    return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
+
   Type *VectorTy;
   unsigned C = getInstructionCost(I, VF, VectorTy);
 
+  // Note: Even if all instructions are scalarized, return true if any memory
+  // accesses appear in the loop to get benefits from address folding etc.
   bool TypeNotScalarized =
-      VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF;
+      VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
   return VectorizationCostTy(C, TypeNotScalarized);
 }
 
@@ -7208,6 +7248,62 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
       setWideningDecision(&I, VF, Decision, Cost);
     }
   }
+
+  // Make sure that any load of an address and any other address computation
+  // remains scalar unless there is gather/scatter support. This avoids
+  // inevitable extracts into address registers, and also has the benefit of
+  // activating LSR more, since that pass can't optimize vectorized
+  // addresses.
+  if (TTI.prefersVectorizedAddressing())
+    return;
+
+  // Start with all scalar pointer uses.
+  SmallPtrSet<Instruction *, 8> AddrDefs;
+  for (BasicBlock *BB : TheLoop->blocks())
+    for (Instruction &I : *BB) {
+      Instruction *PtrDef =
+          dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+      if (PtrDef && TheLoop->contains(PtrDef) &&
+          getWideningDecision(&I, VF) != CM_GatherScatter)
+        AddrDefs.insert(PtrDef);
+    }
+
+  // Add all instructions used to generate the addresses.
+  SmallVector<Instruction *, 4> Worklist;
+  for (auto *I : AddrDefs)
+    Worklist.push_back(I);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    for (auto &Op : I->operands())
+      if (auto *InstOp = dyn_cast<Instruction>(Op))
+        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+            AddrDefs.insert(InstOp).second == true)
+          Worklist.push_back(InstOp);
+  }
+
+  for (auto *I : AddrDefs) {
+    if (isa<LoadInst>(I)) {
+      // Setting the desired widening decision should ideally be handled by
+      // the cost functions, but since this involves the task of finding out
+      // if the loaded register is involved in an address computation, it is
+      // instead changed here when we know this is the case.
+      if (getWideningDecision(I, VF) == CM_Widen)
+        // Scalarize a widened load of an address.
+        setWideningDecision(I, VF, CM_Scalarize,
+                            (VF * getMemoryInstructionCost(I, 1)));
+      else if (auto Group = Legal->getInterleavedAccessGroup(I)) {
+        // Scalarize an interleave group of address loads.
+        for (unsigned I = 0; I < Group->getFactor(); ++I) {
+          if (Instruction *Member = Group->getMember(I))
+            setWideningDecision(Member, VF, CM_Scalarize,
+                                (VF * getMemoryInstructionCost(Member, 1)));
+        }
+      }
+    } else
+      // Make sure I gets scalarized and is given a cost estimate without
+      // scalarization overhead.
+ ForcedScalars[VF].insert(I); + } } unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, @@ -7216,7 +7312,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); - VectorTy = ToVectorTy(RetTy, VF); + VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); auto SE = PSE.getSE(); // TODO: We need to estimate the cost of intrinsic calls. @@ -7349,9 +7445,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } else if (Legal->isUniform(Op2)) { Op2VK = TargetTransformInfo::OK_UniformValue; } - SmallVector<const Value *, 4> Operands(I->operand_values()); - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + SmallVector<const Value *, 4> Operands(I->operand_values()); + unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, + Op2VK, Op1VP, Op2VP, Operands); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); @@ -7374,7 +7471,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } case Instruction::Store: case Instruction::Load: { - VectorTy = ToVectorTy(getMemInstValueType(I), VF); + unsigned Width = VF; + if (Width > 1) { + InstWidening Decision = getWideningDecision(I, Width); + assert(Decision != CM_Unknown && + "CM decision should be taken at this point"); + if (Decision == CM_Scalarize) + Width = 1; + } + VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); } case Instruction::ZExt: |