diff options
Diffstat (limited to 'lib/Transforms')
35 files changed, 1167 insertions, 740 deletions
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 4bc64ab698ff..087a8aa2c624 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -72,10 +72,6 @@ static cl::opt<bool> RunLoopRerolling("reroll-loops", cl::Hidden, cl::desc("Run the loop rerolling pass")); -static cl::opt<bool> RunLoadCombine("combine-loads", cl::init(false), - cl::Hidden, - cl::desc("Run the load combining pass")); - static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden, cl::desc("Run the NewGVN pass")); @@ -174,7 +170,6 @@ PassManagerBuilder::PassManagerBuilder() { SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; RerollLoops = RunLoopRerolling; - LoadCombine = RunLoadCombine; NewGVN = RunNewGVN; DisableGVNLoadPRE = false; VerifyInput = false; @@ -296,6 +291,8 @@ void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) { InstrProfOptions Options; if (!PGOInstrGen.empty()) Options.InstrProfileOutput = PGOInstrGen; + Options.DoCounterPromotion = true; + MPM.add(createLoopRotatePass()); MPM.add(createInstrProfilingLegacyPass(Options)); } if (!PGOInstrUse.empty()) @@ -407,9 +404,6 @@ void PassManagerBuilder::addFunctionSimplificationPasses( } } - if (LoadCombine) - MPM.add(createLoadCombinePass()); - MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs // Clean up after everything. @@ -850,9 +844,6 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // alignments. PM.add(createAlignmentFromAssumptionsPass()); - if (LoadCombine) - PM.add(createLoadCombinePass()); - // Cleanup and simplify the code after the scalar optimizations. addInstructionCombiningPass(PM); addExtensionsToPM(EP_Peephole, PM); diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index 67bc8f5f6b7a..656421ee58df 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -690,6 +690,9 @@ bool SampleProfileLoader::inlineHotFunctions( for (auto I : CIS) { InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr); Function *CalledFunction = CallSite(I).getCalledFunction(); + // Do not inline recursive calls. + if (CalledFunction == &F) + continue; Instruction *DI = I; if (!CalledFunction && !PromotedInsns.count(I) && CallSite(I).isIndirectCall()) diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 287a5167fe2a..d5f0dd191415 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -988,15 +988,24 @@ static Instruction *foldAddWithConstant(BinaryOperator &Add, return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty); } - // Shifts and add used to flip and mask off the low bit: - // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1 - const APInt *C3; - if (C->isOneValue() && - match(Op0, - m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3)))) && - C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) { - Value *NotX = Builder.CreateNot(X); - return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1)); + if (C->isOneValue() && Op0->hasOneUse()) { + // add (sext i1 X), 1 --> zext (not X) + // TODO: The smallest IR representation is (select X, 0, 1), and that would + // not require the one-use check. But we need to remove a transform in + // visitSelect and make sure that IR value tracking for select is equal or + // better than for these ops. + if (match(Op0, m_SExt(m_Value(X))) && + X->getType()->getScalarSizeInBits() == 1) + return new ZExtInst(Builder.CreateNot(X), Ty); + + // Shifts and add used to flip and mask off the low bit: + // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1 + const APInt *C3; + if (match(Op0, m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3))) && + C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) { + Value *NotX = Builder.CreateNot(X); + return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1)); + } } return nullptr; diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index a881bda5ba98..d3d8cefe9735 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1097,20 +1097,11 @@ static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast, Type *DestTy = Logic.getType(); Type *SrcTy = Cast->getSrcTy(); - // If the first operand is bitcast, move the logic operation ahead of the - // bitcast (do the logic operation in the original type). This can eliminate - // bitcasts and allow combines that would otherwise be impeded by the bitcast. + // Move the logic operation ahead of a zext if the constant is unchanged in + // the smaller source type. Performing the logic in a smaller type may provide + // more information to later folds, and the smaller logic instruction may be + // cheaper (particularly in the case of vectors). Value *X; - if (match(Cast, m_BitCast(m_Value(X)))) { - Value *NewConstant = ConstantExpr::getBitCast(C, SrcTy); - Value *NewOp = Builder->CreateBinOp(LogicOpc, X, NewConstant); - return CastInst::CreateBitOrPointerCast(NewOp, DestTy); - } - - // Similarly, move the logic operation ahead of a zext if the constant is - // unchanged in the smaller source type. Performing the logic in a smaller - // type may provide more information to later folds, and the smaller logic - // instruction may be cheaper (particularly in the case of vectors). if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) { Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy); Constant *ZextTruncC = ConstantExpr::getZExt(TruncC, DestTy); @@ -1239,9 +1230,10 @@ static Instruction *foldAndToXor(BinaryOperator &I, // (A | ~B) & (B | ~A) --> ~(A ^ B) // (~B | A) & (~A | B) --> ~(A ^ B) // (~B | A) & (B | ~A) --> ~(A ^ B) - if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) && - match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) - return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); + if (Op0->hasOneUse() || Op1->hasOneUse()) + if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) + return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); return nullptr; } @@ -1256,9 +1248,10 @@ static Instruction *foldOrToXor(BinaryOperator &I, // Operand complexity canonicalization guarantees that the 'and' is Op0. // (A & B) | ~(A | B) --> ~(A ^ B) // (A & B) | ~(B | A) --> ~(A ^ B) - if (match(Op0, m_And(m_Value(A), m_Value(B))) && - match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) - return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); + if (Op0->hasOneUse() || Op1->hasOneUse()) + if (match(Op0, m_And(m_Value(A), m_Value(B))) && + match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) + return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); // (A & ~B) | (~A & B) --> A ^ B // (A & ~B) | (B & ~A) --> A ^ B @@ -1442,13 +1435,13 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) - if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse()) + if (Op1->hasOneUse() || IsFreeToInvert(C, C->hasOneUse())) return BinaryOperator::CreateAnd(Op0, Builder->CreateNot(C)); // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B)))) if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) - if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse()) + if (Op0->hasOneUse() || IsFreeToInvert(C, C->hasOneUse())) return BinaryOperator::CreateAnd(Op1, Builder->CreateNot(C)); // (A | B) & ((~A) ^ B) -> (A & B) @@ -1579,11 +1572,14 @@ static Value *getSelectCondition(Value *A, Value *B, // If A and B are sign-extended, look through the sexts to find the booleans. Value *Cond; + Value *NotB; if (match(A, m_SExt(m_Value(Cond))) && Cond->getType()->getScalarType()->isIntegerTy(1) && - match(B, m_CombineOr(m_Not(m_SExt(m_Specific(Cond))), - m_SExt(m_Not(m_Specific(Cond)))))) - return Cond; + match(B, m_OneUse(m_Not(m_Value(NotB))))) { + NotB = peekThroughBitcast(NotB, true); + if (match(NotB, m_SExt(m_Specific(Cond)))) + return Cond; + } // All scalar (and most vector) possibilities should be handled now. // Try more matches that only apply to non-splat constant vectors. @@ -1615,12 +1611,8 @@ static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D, // The potential condition of the select may be bitcasted. In that case, look // through its bitcast and the corresponding bitcast of the 'not' condition. Type *OrigType = A->getType(); - Value *SrcA, *SrcB; - if (match(A, m_OneUse(m_BitCast(m_Value(SrcA)))) && - match(B, m_OneUse(m_BitCast(m_Value(SrcB))))) { - A = SrcA; - B = SrcB; - } + A = peekThroughBitcast(A, true); + B = peekThroughBitcast(B, true); if (Value *Cond = getSelectCondition(A, B, Builder)) { // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D)) @@ -1922,8 +1914,9 @@ Value *InstCombiner::foldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { /// (A & C1) | B /// /// when the XOR of the two constants is "all ones" (-1). -Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op, - Value *A, Value *B, Value *C) { +static Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, + Value *A, Value *B, Value *C, + InstCombiner::BuilderTy *Builder) { ConstantInt *CI1 = dyn_cast<ConstantInt>(C); if (!CI1) return nullptr; @@ -1944,15 +1937,16 @@ Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op, /// \brief This helper function folds: /// -/// ((A | B) & C1) ^ (B & C2) +/// ((A ^ B) & C1) | (B & C2) /// /// into: /// /// (A & C1) ^ B /// /// when the XOR of the two constants is "all ones" (-1). -Instruction *InstCombiner::FoldXorWithConstants(BinaryOperator &I, Value *Op, - Value *A, Value *B, Value *C) { +static Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, + Value *A, Value *B, Value *C, + InstCombiner::BuilderTy *Builder) { ConstantInt *CI1 = dyn_cast<ConstantInt>(C); if (!CI1) return nullptr; @@ -2112,46 +2106,36 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } // ((A|B)&1)|(B&-2) -> (A&1) | B - if (match(A, m_Or(m_Value(V1), m_Specific(B))) || - match(A, m_Or(m_Specific(B), m_Value(V1)))) { - Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C); - if (Ret) return Ret; + if (match(A, m_c_Or(m_Value(V1), m_Specific(B)))) { + if (Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C, Builder)) + return Ret; } // (B&-2)|((A|B)&1) -> (A&1) | B - if (match(B, m_Or(m_Specific(A), m_Value(V1))) || - match(B, m_Or(m_Value(V1), m_Specific(A)))) { - Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D); - if (Ret) return Ret; + if (match(B, m_c_Or(m_Specific(A), m_Value(V1)))) { + if (Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D, Builder)) + return Ret; } // ((A^B)&1)|(B&-2) -> (A&1) ^ B - if (match(A, m_Xor(m_Value(V1), m_Specific(B))) || - match(A, m_Xor(m_Specific(B), m_Value(V1)))) { - Instruction *Ret = FoldXorWithConstants(I, Op1, V1, B, C); - if (Ret) return Ret; + if (match(A, m_c_Xor(m_Value(V1), m_Specific(B)))) { + if (Instruction *Ret = FoldXorWithConstants(I, Op1, V1, B, C, Builder)) + return Ret; } // (B&-2)|((A^B)&1) -> (A&1) ^ B - if (match(B, m_Xor(m_Specific(A), m_Value(V1))) || - match(B, m_Xor(m_Value(V1), m_Specific(A)))) { - Instruction *Ret = FoldXorWithConstants(I, Op0, A, V1, D); - if (Ret) return Ret; + if (match(B, m_c_Xor(m_Specific(A), m_Value(V1)))) { + if (Instruction *Ret = FoldXorWithConstants(I, Op0, A, V1, D, Builder)) + return Ret; } } // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C - // FIXME: The two hasOneUse calls here are the same call, maybe we were - // supposed to check Op1->operand(0)? if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) - if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse()) - return BinaryOperator::CreateOr(Op0, C); + return BinaryOperator::CreateOr(Op0, C); // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C - // FIXME: The two hasOneUse calls here are the same call, maybe we were - // supposed to check Op0->operand(0)? if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B)))) if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) - if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse()) - return BinaryOperator::CreateOr(Op1, C); + return BinaryOperator::CreateOr(Op1, C); // ((B | C) & A) | B -> B | (A & C) if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A)))) @@ -2357,6 +2341,30 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) { } } + // Instead of trying to imitate the folds for and/or, decompose this 'xor' + // into those logic ops. That is, try to turn this into an and-of-icmps + // because we have many folds for that pattern. + // + // This is based on a truth table definition of xor: + // X ^ Y --> (X | Y) & !(X & Y) + if (Value *OrICmp = SimplifyBinOp(Instruction::Or, LHS, RHS, SQ)) { + // TODO: If OrICmp is true, then the definition of xor simplifies to !(X&Y). + // TODO: If OrICmp is false, the whole thing is false (InstSimplify?). + if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) { + // TODO: Independently handle cases where the 'and' side is a constant. + if (OrICmp == LHS && AndICmp == RHS && RHS->hasOneUse()) { + // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS + RHS->setPredicate(RHS->getInversePredicate()); + return Builder->CreateAnd(LHS, RHS); + } + if (OrICmp == RHS && AndICmp == LHS && LHS->hasOneUse()) { + // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS + LHS->setPredicate(LHS->getInversePredicate()); + return Builder->CreateAnd(LHS, RHS); + } + } + } + return nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index c0830a5d2112..dbed7ad4eae8 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1409,6 +1409,47 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { } } + // Add range metadata since known bits can't completely reflect what we know. + // TODO: Handle splat vectors. + auto *IT = dyn_cast<IntegerType>(Op0->getType()); + if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) { + Metadata *LowAndHigh[] = { + ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)), + ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))}; + II.setMetadata(LLVMContext::MD_range, + MDNode::get(II.getContext(), LowAndHigh)); + return &II; + } + + return nullptr; +} + +static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) { + assert(II.getIntrinsicID() == Intrinsic::ctpop && + "Expected ctpop intrinsic"); + Value *Op0 = II.getArgOperand(0); + // FIXME: Try to simplify vectors of integers. + auto *IT = dyn_cast<IntegerType>(Op0->getType()); + if (!IT) + return nullptr; + + unsigned BitWidth = IT->getBitWidth(); + KnownBits Known(BitWidth); + IC.computeKnownBits(Op0, Known, 0, &II); + + unsigned MinCount = Known.countMinPopulation(); + unsigned MaxCount = Known.countMaxPopulation(); + + // Add range metadata since known bits can't completely reflect what we know. + if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) { + Metadata *LowAndHigh[] = { + ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)), + ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))}; + II.setMetadata(LLVMContext::MD_range, + MDNode::get(II.getContext(), LowAndHigh)); + return &II; + } + return nullptr; } @@ -1981,6 +2022,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return I; break; + case Intrinsic::ctpop: + if (auto *I = foldCtpop(*II, *this)) + return I; + break; + case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: case Intrinsic::umul_with_overflow: diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 38e95fb11639..d3049389dfb9 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1896,6 +1896,18 @@ static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast, return BinaryOperator::Create(BO->getOpcode(), CastedOp0, X); } + // Canonicalize vector bitcasts to come before vector bitwise logic with a + // constant. This eases recognition of special constants for later ops. + // Example: + // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b + Constant *C; + if (match(BO->getOperand(1), m_Constant(C))) { + // bitcast (logic X, C) --> logic (bitcast X, C') + Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy); + Value *CastedC = ConstantExpr::getBitCast(C, DestTy); + return BinaryOperator::Create(BO->getOpcode(), CastedOp0, CastedC); + } + return nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 1ef4acfb058c..6ad32490a328 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2434,6 +2434,77 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, return nullptr; } +bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, + Value *&RHS, ConstantInt *&Less, + ConstantInt *&Equal, + ConstantInt *&Greater) { + // TODO: Generalize this to work with other comparison idioms or ensure + // they get canonicalized into this form. + + // select i1 (a == b), i32 Equal, i32 (select i1 (a < b), i32 Less, i32 + // Greater), where Equal, Less and Greater are placeholders for any three + // constants. + ICmpInst::Predicate PredA, PredB; + if (match(SI->getTrueValue(), m_ConstantInt(Equal)) && + match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) && + PredA == ICmpInst::ICMP_EQ && + match(SI->getFalseValue(), + m_Select(m_ICmp(PredB, m_Specific(LHS), m_Specific(RHS)), + m_ConstantInt(Less), m_ConstantInt(Greater))) && + PredB == ICmpInst::ICMP_SLT) { + return true; + } + return false; +} + +Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp, + Instruction *Select, + ConstantInt *C) { + + assert(C && "Cmp RHS should be a constant int!"); + // If we're testing a constant value against the result of a three way + // comparison, the result can be expressed directly in terms of the + // original values being compared. Note: We could possibly be more + // aggressive here and remove the hasOneUse test. The original select is + // really likely to simplify or sink when we remove a test of the result. + Value *OrigLHS, *OrigRHS; + ConstantInt *C1LessThan, *C2Equal, *C3GreaterThan; + if (Cmp.hasOneUse() && + matchThreeWayIntCompare(cast<SelectInst>(Select), OrigLHS, OrigRHS, + C1LessThan, C2Equal, C3GreaterThan)) { + assert(C1LessThan && C2Equal && C3GreaterThan); + + bool TrueWhenLessThan = + ConstantExpr::getCompare(Cmp.getPredicate(), C1LessThan, C) + ->isAllOnesValue(); + bool TrueWhenEqual = + ConstantExpr::getCompare(Cmp.getPredicate(), C2Equal, C) + ->isAllOnesValue(); + bool TrueWhenGreaterThan = + ConstantExpr::getCompare(Cmp.getPredicate(), C3GreaterThan, C) + ->isAllOnesValue(); + + // This generates the new instruction that will replace the original Cmp + // Instruction. Instead of enumerating the various combinations when + // TrueWhenLessThan, TrueWhenEqual and TrueWhenGreaterThan are true versus + // false, we rely on chaining of ORs and future passes of InstCombine to + // simplify the OR further (i.e. a s< b || a == b becomes a s<= b). + + // When none of the three constants satisfy the predicate for the RHS (C), + // the entire original Cmp can be simplified to a false. + Value *Cond = Builder->getFalse(); + if (TrueWhenLessThan) + Cond = Builder->CreateOr(Cond, Builder->CreateICmp(ICmpInst::ICMP_SLT, OrigLHS, OrigRHS)); + if (TrueWhenEqual) + Cond = Builder->CreateOr(Cond, Builder->CreateICmp(ICmpInst::ICMP_EQ, OrigLHS, OrigRHS)); + if (TrueWhenGreaterThan) + Cond = Builder->CreateOr(Cond, Builder->CreateICmp(ICmpInst::ICMP_SGT, OrigLHS, OrigRHS)); + + return replaceInstUsesWith(Cmp, Cond); + } + return nullptr; +} + /// Try to fold integer comparisons with a constant operand: icmp Pred X, C /// where X is some kind of instruction. Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) { @@ -2493,11 +2564,28 @@ Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) { return I; } + // Match against CmpInst LHS being instructions other than binary operators. Instruction *LHSI; - if (match(Cmp.getOperand(0), m_Instruction(LHSI)) && - LHSI->getOpcode() == Instruction::Trunc) - if (Instruction *I = foldICmpTruncConstant(Cmp, LHSI, C)) - return I; + if (match(Cmp.getOperand(0), m_Instruction(LHSI))) { + switch (LHSI->getOpcode()) { + case Instruction::Select: + { + // For now, we only support constant integers while folding the + // ICMP(SELECT)) pattern. We can extend this to support vector of integers + // similar to the cases handled by binary ops above. + if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1))) + if (Instruction *I = foldICmpSelectConstant(Cmp, LHSI, ConstRHS)) + return I; + break; + } + case Instruction::Trunc: + if (Instruction *I = foldICmpTruncConstant(Cmp, LHSI, C)) + return I; + break; + default: + break; + } + } if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, C)) return I; @@ -3110,8 +3198,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { if (BO0) { // Transform A & (L - 1) `ult` L --> L != 0 auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes()); - auto BitwiseAnd = - m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value())); + auto BitwiseAnd = m_c_And(m_Value(), LSubOne); if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) { auto *Zero = Constant::getNullValue(BO0->getType()); diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 1a7db146df42..1b0fe84dd4dd 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -95,6 +95,18 @@ static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) { } } +/// Return the source operand of a potentially bitcasted value while optionally +/// checking if it has one use. If there is no bitcast or the one use check is +/// not met, return the input value itself. +static inline Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) { + if (auto *BitCast = dyn_cast<BitCastInst>(V)) + if (!OneUseOnly || BitCast->hasOneUse()) + return BitCast->getOperand(0); + + // V is not a bitcast or V has more than one use and OneUseOnly is true. + return V; +} + /// \brief Add one to a Constant static inline Constant *AddOne(Constant *C) { return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); @@ -276,10 +288,6 @@ public: Instruction *visitFDiv(BinaryOperator &I); Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted); Instruction *visitAnd(BinaryOperator &I); - Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A, - Value *B, Value *C); - Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, Value *A, - Value *B, Value *C); Instruction *visitOr(BinaryOperator &I); Instruction *visitXor(BinaryOperator &I); Instruction *visitShl(BinaryOperator &I); @@ -595,6 +603,15 @@ private: Instruction::BinaryOps, Value *, Value *, Value *, Value *); + /// Match a select chain which produces one of three values based on whether + /// the LHS is less than, equal to, or greater than RHS respectively. + /// Return true if we matched a three way compare idiom. The LHS, RHS, Less, + /// Equal and Greater values are saved in the matching process and returned to + /// the caller. + bool matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, Value *&RHS, + ConstantInt *&Less, ConstantInt *&Equal, + ConstantInt *&Greater); + /// \brief Attempts to replace V with a simpler value based on the demanded /// bits. Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known, @@ -672,6 +689,8 @@ private: Instruction *foldICmpBinOp(ICmpInst &Cmp); Instruction *foldICmpEquality(ICmpInst &Cmp); + Instruction *foldICmpSelectConstant(ICmpInst &Cmp, Instruction *Select, + ConstantInt *C); Instruction *foldICmpTruncConstant(ICmpInst &Cmp, Instruction *Trunc, const APInt *C); Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And, diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index a4d84ae81aa0..ca370c73fca4 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -169,6 +169,18 @@ isOnlyCopiedFromConstantGlobal(AllocaInst *AI, return nullptr; } +/// Returns true if V is dereferenceable for size of alloca. +static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI, + const DataLayout &DL) { + if (AI->isArrayAllocation()) + return false; + uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType()); + if (!AllocaSize) + return false; + return isDereferenceableAndAlignedPointer(V, AI->getAlignment(), + APInt(64, AllocaSize), DL); +} + static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) { // Check for array size of 1 (scalar allocation). if (!AI.isArrayAllocation()) { @@ -390,7 +402,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) { unsigned SourceAlign = getOrEnforceKnownAlignment( Copy->getSource(), AI.getAlignment(), DL, &AI, &AC, &DT); - if (AI.getAlignment() <= SourceAlign) { + if (AI.getAlignment() <= SourceAlign && + isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) { DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n'); DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) @@ -476,21 +489,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT break; case LLVMContext::MD_nonnull: - // This only directly applies if the new type is also a pointer. - if (NewTy->isPointerTy()) { - NewLoad->setMetadata(ID, N); - break; - } - // If it's integral now, translate it to !range metadata. - if (NewTy->isIntegerTy()) { - auto *ITy = cast<IntegerType>(NewTy); - auto *NullInt = ConstantExpr::getPtrToInt( - ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy); - auto *NonNullInt = - ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1)); - NewLoad->setMetadata(LLVMContext::MD_range, - MDB.createRange(NonNullInt, NullInt)); - } + copyNonnullMetadata(LI, N, *NewLoad); break; case LLVMContext::MD_align: case LLVMContext::MD_dereferenceable: @@ -500,17 +499,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT NewLoad->setMetadata(ID, N); break; case LLVMContext::MD_range: - // FIXME: It would be nice to propagate this in some way, but the type - // conversions make it hard. - - // If it's a pointer now and the range does not contain 0, make it !nonnull. - if (NewTy->isPointerTy()) { - unsigned BitWidth = IC.getDataLayout().getTypeSizeInBits(NewTy); - if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) { - MDNode *NN = MDNode::get(LI.getContext(), None); - NewLoad->setMetadata(LLVMContext::MD_nonnull, NN); - } - } + copyRangeMetadata(IC.getDataLayout(), LI, N, *NewLoad); break; } } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index b9674d85634d..33951e66497a 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -303,7 +303,7 @@ Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, /// We want to turn: /// (select (icmp eq (and X, C1), 0), Y, (or Y, C2)) /// into: -/// (or (shl (and X, C1), C3), y) +/// (or (shl (and X, C1), C3), Y) /// iff: /// C1 and C2 are both powers of 2 /// where: @@ -317,19 +317,44 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, Value *FalseVal, InstCombiner::BuilderTy *Builder) { const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition()); - if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy()) + if (!IC || !SI.getType()->isIntegerTy()) return nullptr; Value *CmpLHS = IC->getOperand(0); Value *CmpRHS = IC->getOperand(1); - if (!match(CmpRHS, m_Zero())) - return nullptr; + Value *V; + unsigned C1Log; + bool IsEqualZero; + bool NeedAnd = false; + if (IC->isEquality()) { + if (!match(CmpRHS, m_Zero())) + return nullptr; + + const APInt *C1; + if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1)))) + return nullptr; + + V = CmpLHS; + C1Log = C1->logBase2(); + IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_EQ; + } else if (IC->getPredicate() == ICmpInst::ICMP_SLT || + IC->getPredicate() == ICmpInst::ICMP_SGT) { + // We also need to recognize (icmp slt (trunc (X)), 0) and + // (icmp sgt (trunc (X)), -1). + IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_SGT; + if ((IsEqualZero && !match(CmpRHS, m_AllOnes())) || + (!IsEqualZero && !match(CmpRHS, m_Zero()))) + return nullptr; + + if (!match(CmpLHS, m_OneUse(m_Trunc(m_Value(V))))) + return nullptr; - Value *X; - const APInt *C1; - if (!match(CmpLHS, m_And(m_Value(X), m_Power2(C1)))) + C1Log = CmpLHS->getType()->getScalarSizeInBits() - 1; + NeedAnd = true; + } else { return nullptr; + } const APInt *C2; bool OrOnTrueVal = false; @@ -340,11 +365,27 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, if (!OrOnFalseVal && !OrOnTrueVal) return nullptr; - Value *V = CmpLHS; Value *Y = OrOnFalseVal ? TrueVal : FalseVal; - unsigned C1Log = C1->logBase2(); unsigned C2Log = C2->logBase2(); + + bool NeedXor = (!IsEqualZero && OrOnFalseVal) || (IsEqualZero && OrOnTrueVal); + bool NeedShift = C1Log != C2Log; + bool NeedZExtTrunc = Y->getType()->getIntegerBitWidth() != + V->getType()->getIntegerBitWidth(); + + // Make sure we don't create more instructions than we save. + Value *Or = OrOnFalseVal ? FalseVal : TrueVal; + if ((NeedShift + NeedXor + NeedZExtTrunc) > + (IC->hasOneUse() + Or->hasOneUse())) + return nullptr; + + if (NeedAnd) { + // Insert the AND instruction on the input to the truncate. + APInt C1 = APInt::getOneBitSet(V->getType()->getScalarSizeInBits(), C1Log); + V = Builder->CreateAnd(V, ConstantInt::get(V->getType(), C1)); + } + if (C2Log > C1Log) { V = Builder->CreateZExtOrTrunc(V, Y->getType()); V = Builder->CreateShl(V, C2Log - C1Log); @@ -354,9 +395,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, } else V = Builder->CreateZExtOrTrunc(V, Y->getType()); - ICmpInst::Predicate Pred = IC->getPredicate(); - if ((Pred == ICmpInst::ICMP_NE && OrOnFalseVal) || - (Pred == ICmpInst::ICMP_EQ && OrOnTrueVal)) + if (NeedXor) V = Builder->CreateXor(V, *C2); return Builder->CreateOr(V, Y); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 8cec865c6422..1bb1a85367d1 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -556,8 +556,7 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { // The inexact versions are deferred to DAGCombine, so we don't hide shl // behind a bit mask. const APInt *ShOp1; - if (match(Op0, m_CombineOr(m_Exact(m_LShr(m_Value(X), m_APInt(ShOp1))), - m_Exact(m_AShr(m_Value(X), m_APInt(ShOp1)))))) { + if (match(Op0, m_Exact(m_Shr(m_Value(X), m_APInt(ShOp1))))) { unsigned ShrAmt = ShOp1->getZExtValue(); if (ShrAmt < ShAmt) { // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1) diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 65e6d2e35905..02fac4fb37a4 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -939,9 +939,19 @@ Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) { // `TrueVInPred`. if (InC && !isa<ConstantExpr>(InC) && isa<ConstantInt>(InC)) InV = InC->isNullValue() ? FalseVInPred : TrueVInPred; - else + else { + // Generate the select in the same block as PN's current incoming block. + // Note: ThisBB need not be the NonConstBB because vector constants + // which are constants by definition are handled here. + // FIXME: This can lead to an increase in IR generation because we might + // generate selects for vector constant phi operand, that could not be + // folded to TrueVInPred or FalseVInPred as done for ConstantInt. For + // non-vector phis, this transformation was always profitable because + // the select would be generated exactly once in the NonConstBB. + Builder->SetInsertPoint(ThisBB->getTerminator()); InV = Builder->CreateSelect(PN->getIncomingValue(i), TrueVInPred, FalseVInPred, "phitmp"); + } NewPN->addIncoming(InV, ThisBB); } } else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) { @@ -3002,6 +3012,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, ++NumDeadInst; DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n'); Inst->eraseFromParent(); + MadeIRChange = true; continue; } @@ -3015,6 +3026,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, ++NumConstProp; if (isInstructionTriviallyDead(Inst, TLI)) Inst->eraseFromParent(); + MadeIRChange = true; continue; } diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 0d308810009d..4089d81ea3e1 100644 --- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -642,7 +642,12 @@ static bool promoteIndirectCalls(Module &M, bool InLTO, bool SamplePGO) { if (DisableICP) return false; InstrProfSymtab Symtab; - Symtab.create(M, InLTO); + if (Error E = Symtab.create(M, InLTO)) { + std::string SymtabFailure = toString(std::move(E)); + DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n"); + (void)SymtabFailure; + return false; + } bool Changed = false; for (auto &F : M) { if (F.isDeclaration()) diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp index 37f88d5f95f1..9c14b0149fdc 100644 --- a/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -19,12 +19,14 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" @@ -40,7 +42,10 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -92,6 +97,35 @@ cl::opt<double> NumCountersPerValueSite( // is usually smaller than 2. cl::init(1.0)); +cl::opt<bool> AtomicCounterUpdatePromoted( + "atomic-counter-update-promoted", cl::ZeroOrMore, + cl::desc("Do counter update using atomic fetch add " + " for promoted counters only"), + cl::init(false)); + +// If the option is not specified, the default behavior about whether +// counter promotion is done depends on how instrumentaiton lowering +// pipeline is setup, i.e., the default value of true of this option +// does not mean the promotion will be done by default. Explicitly +// setting this option can override the default behavior. +cl::opt<bool> DoCounterPromotion("do-counter-promotion", cl::ZeroOrMore, + cl::desc("Do counter register promotion"), + cl::init(false)); +cl::opt<unsigned> MaxNumOfPromotionsPerLoop( + cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(10), + cl::desc("Max number counter promotions per loop to avoid" + " increasing register pressure too much")); + +// A debug option +cl::opt<int> + MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1), + cl::desc("Max number of allowed counter promotions")); + +cl::opt<bool> SpeculativeCounterPromotion( + cl::ZeroOrMore, "speculative-counter-promotion", cl::init(false), + cl::desc("Allow counter promotion for loops with multiple exiting blocks " + " or top-tested loops. ")); + class InstrProfilingLegacyPass : public ModulePass { InstrProfiling InstrProf; @@ -116,6 +150,123 @@ public: } }; +/// A helper class to promote one counter RMW operation in the loop +/// into register update. +/// +/// RWM update for the counter will be sinked out of the loop after +/// the transformation. +/// +class PGOCounterPromoterHelper : public LoadAndStorePromoter { +public: + PGOCounterPromoterHelper(Instruction *L, Instruction *S, SSAUpdater &SSA, + Value *Init, BasicBlock *PH, + ArrayRef<BasicBlock *> ExitBlocks, + ArrayRef<Instruction *> InsertPts) + : LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks), + InsertPts(InsertPts) { + assert(isa<LoadInst>(L)); + assert(isa<StoreInst>(S)); + SSA.AddAvailableValue(PH, Init); + } + void doExtraRewritesBeforeFinalDeletion() const override { + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBlock = ExitBlocks[i]; + Instruction *InsertPos = InsertPts[i]; + // Get LiveIn value into the ExitBlock. If there are multiple + // predecessors, the value is defined by a PHI node in this + // block. + Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); + Value *Addr = cast<StoreInst>(Store)->getPointerOperand(); + IRBuilder<> Builder(InsertPos); + if (AtomicCounterUpdatePromoted) + Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue, + AtomicOrdering::SequentiallyConsistent); + else { + LoadInst *OldVal = Builder.CreateLoad(Addr, "pgocount.promoted"); + auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue); + Builder.CreateStore(NewVal, Addr); + } + } + } + +private: + Instruction *Store; + ArrayRef<BasicBlock *> ExitBlocks; + ArrayRef<Instruction *> InsertPts; +}; + +/// A helper class to do register promotion for all profile counter +/// updates in a loop. +/// +class PGOCounterPromoter { +public: + PGOCounterPromoter(ArrayRef<LoadStorePair> Cands, Loop &Loop) + : Candidates(Cands), ExitBlocks(), InsertPts(), ParentLoop(Loop) { + + SmallVector<BasicBlock *, 8> LoopExitBlocks; + SmallPtrSet<BasicBlock *, 8> BlockSet; + ParentLoop.getExitBlocks(LoopExitBlocks); + + for (BasicBlock *ExitBlock : LoopExitBlocks) { + if (BlockSet.insert(ExitBlock).second) { + ExitBlocks.push_back(ExitBlock); + InsertPts.push_back(&*ExitBlock->getFirstInsertionPt()); + } + } + } + + bool run(int64_t *NumPromoted) { + // We can't insert into a catchswitch. + bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) { + return isa<CatchSwitchInst>(Exit->getTerminator()); + }); + + if (HasCatchSwitch) + return false; + + if (!ParentLoop.hasDedicatedExits()) + return false; + + BasicBlock *PH = ParentLoop.getLoopPreheader(); + if (!PH) + return false; + + BasicBlock *H = ParentLoop.getHeader(); + bool TopTested = + ((ParentLoop.getBlocks().size() > 1) && ParentLoop.isLoopExiting(H)); + if (!SpeculativeCounterPromotion && + (TopTested || ParentLoop.getExitingBlock() == nullptr)) + return false; + + unsigned Promoted = 0; + for (auto &Cand : Candidates) { + + SmallVector<PHINode *, 4> NewPHIs; + SSAUpdater SSA(&NewPHIs); + Value *InitVal = ConstantInt::get(Cand.first->getType(), 0); + PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal, + PH, ExitBlocks, InsertPts); + Promoter.run(SmallVector<Instruction *, 2>({Cand.first, Cand.second})); + Promoted++; + if (Promoted >= MaxNumOfPromotionsPerLoop) + break; + (*NumPromoted)++; + if (MaxNumOfPromotions != -1 && *NumPromoted >= MaxNumOfPromotions) + break; + } + + DEBUG(dbgs() << Promoted << " counters promoted for loop (depth=" + << ParentLoop.getLoopDepth() << ")\n"); + return Promoted != 0; + } + +private: + ArrayRef<LoadStorePair> Candidates; + SmallVector<BasicBlock *, 8> ExitBlocks; + SmallVector<Instruction *, 8> InsertPts; + Loop &ParentLoop; +}; + } // end anonymous namespace PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) { @@ -147,6 +298,63 @@ static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) { return dyn_cast<InstrProfIncrementInst>(Instr); } +bool InstrProfiling::lowerIntrinsics(Function *F) { + bool MadeChange = false; + PromotionCandidates.clear(); + for (BasicBlock &BB : *F) { + for (auto I = BB.begin(), E = BB.end(); I != E;) { + auto Instr = I++; + InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr); + if (Inc) { + lowerIncrement(Inc); + MadeChange = true; + } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) { + lowerValueProfileInst(Ind); + MadeChange = true; + } + } + } + + if (!MadeChange) + return false; + + promoteCounterLoadStores(F); + return true; +} + +bool InstrProfiling::isCounterPromotionEnabled() const { + if (DoCounterPromotion.getNumOccurrences() > 0) + return DoCounterPromotion; + + return Options.DoCounterPromotion; +} + +void InstrProfiling::promoteCounterLoadStores(Function *F) { + if (!isCounterPromotionEnabled()) + return; + + DominatorTree DT(*F); + LoopInfo LI(DT); + DenseMap<Loop *, SmallVector<LoadStorePair, 8>> LoopPromotionCandidates; + + for (const auto &LoadStore : PromotionCandidates) { + auto *CounterLoad = LoadStore.first; + auto *CounterStore = LoadStore.second; + BasicBlock *BB = CounterLoad->getParent(); + Loop *ParentLoop = LI.getLoopFor(BB); + if (!ParentLoop) + continue; + LoopPromotionCandidates[ParentLoop].emplace_back(CounterLoad, CounterStore); + } + + SmallVector<Loop *, 4> Loops = LI.getLoopsInPreorder(); + + for (auto *Loop : Loops) { + PGOCounterPromoter Promoter(LoopPromotionCandidates[Loop], *Loop); + Promoter.run(&TotalCountersPromoted); + } +} + bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) { bool MadeChange = false; @@ -179,18 +387,7 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) { } for (Function &F : M) - for (BasicBlock &BB : F) - for (auto I = BB.begin(), E = BB.end(); I != E;) { - auto Instr = I++; - InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr); - if (Inc) { - lowerIncrement(Inc); - MadeChange = true; - } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) { - lowerValueProfileInst(Ind); - MadeChange = true; - } - } + MadeChange |= lowerIntrinsics(&F); if (GlobalVariable *CoverageNamesVar = M.getNamedGlobal(getCoverageUnusedNamesVarName())) { @@ -303,9 +500,12 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { IRBuilder<> Builder(Inc); uint64_t Index = Inc->getIndex()->getZExtValue(); Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index); - Value *Count = Builder.CreateLoad(Addr, "pgocount"); - Count = Builder.CreateAdd(Count, Inc->getStep()); - Inc->replaceAllUsesWith(Builder.CreateStore(Count, Addr)); + Value *Load = Builder.CreateLoad(Addr, "pgocount"); + auto *Count = Builder.CreateAdd(Load, Inc->getStep()); + auto *Store = Builder.CreateStore(Count, Addr); + Inc->replaceAllUsesWith(Store); + if (isCounterPromotionEnabled()) + PromotionCandidates.emplace_back(cast<Instruction>(Load), Store); Inc->eraseFromParent(); } diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index b2d95271479c..0e7d11c55397 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1177,7 +1177,7 @@ void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) { Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), - Builder.getInt64(FuncHash), Builder.CreatePtrToInt(Length, Int64Ty), + Builder.getInt64(FuncHash), Builder.CreateZExtOrTrunc(Length, Int64Ty), Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)}); ++CurCtrId; } diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index f5196cc46181..457c9427ab9a 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -22,7 +22,6 @@ add_llvm_library(LLVMScalarOpts LICM.cpp LoopAccessAnalysisPrinter.cpp LoopSink.cpp - LoadCombine.cpp LoopDeletion.cpp LoopDataPrefetch.cpp LoopDistribute.cpp diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 2a4c9526dfcd..28157783daa7 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -232,8 +232,7 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { pred_iterator PB = pred_begin(BB), PE = pred_end(BB); if (PB == PE) return false; - // Analyse each switch case in turn. This is done in reverse order so that - // removing a case doesn't cause trouble for the iteration. + // Analyse each switch case in turn. bool Changed = false; for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) { ConstantInt *Case = CI->getCaseValue(); @@ -291,7 +290,7 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { break; } - // Increment the case iterator sense we didn't delete it. + // Increment the case iterator since we didn't delete it. ++CI; } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 0490d93f6455..c0f628eb61e6 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -80,9 +80,10 @@ MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, struct llvm::GVN::Expression { uint32_t opcode; Type *type; + bool commutative; SmallVector<uint32_t, 4> varargs; - Expression(uint32_t o = ~2U) : opcode(o) {} + Expression(uint32_t o = ~2U) : opcode(o), commutative(false) {} bool operator==(const Expression &other) const { if (opcode != other.opcode) @@ -246,6 +247,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!"); if (e.varargs[0] > e.varargs[1]) std::swap(e.varargs[0], e.varargs[1]); + e.commutative = true; } if (CmpInst *C = dyn_cast<CmpInst>(I)) { @@ -256,6 +258,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (C->getOpcode() << 8) | Predicate; + e.commutative = true; } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) { for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); II != IE; ++II) @@ -281,6 +284,7 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode, Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (Opcode << 8) | Predicate; + e.commutative = true; return e; } @@ -348,25 +352,25 @@ GVN::ValueTable::~ValueTable() = default; /// add - Insert a value into the table with a specified value number. void GVN::ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); + if (PHINode *PN = dyn_cast<PHINode>(V)) + NumberingPhi[num] = PN; } uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { if (AA->doesNotAccessMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } else if (AA->onlyReadsMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) { - e = nextValueNumber++; - valueNumbering[C] = e; - return e; + auto ValNum = assignExpNewValueNum(exp); + if (ValNum.second) { + valueNumbering[C] = ValNum.first; + return ValNum.first; } if (!MD) { - e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } @@ -522,23 +526,29 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) { case Instruction::ExtractValue: exp = createExtractvalueExpr(cast<ExtractValueInst>(I)); break; + case Instruction::PHI: + valueNumbering[V] = nextValueNumber; + NumberingPhi[nextValueNumber] = cast<PHINode>(V); + return nextValueNumber++; default: valueNumbering[V] = nextValueNumber; return nextValueNumber++; } - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[V] = e; return e; } /// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. -uint32_t GVN::ValueTable::lookup(Value *V) const { +uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const { DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); - assert(VI != valueNumbering.end() && "Value not numbered?"); - return VI->second; + if (Verify) { + assert(VI != valueNumbering.end() && "Value not numbered?"); + return VI->second; + } + return (VI != valueNumbering.end()) ? VI->second : 0; } /// Returns the value number of the given comparison, @@ -549,21 +559,28 @@ uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS) { Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS); - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; - return e; + return assignExpNewValueNum(exp).first; } /// Remove all entries from the ValueTable. void GVN::ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); + NumberingPhi.clear(); + PhiTranslateTable.clear(); nextValueNumber = 1; + Expressions.clear(); + ExprIdx.clear(); + nextExprNumber = 0; } /// Remove a value from the value numbering. void GVN::ValueTable::erase(Value *V) { + uint32_t Num = valueNumbering.lookup(V); valueNumbering.erase(V); + // If V is PHINode, V <--> value number is an one-to-one mapping. + if (isa<PHINode>(V)) + NumberingPhi.erase(Num); } /// verifyRemoved - Verify that the value is removed from all internal data @@ -602,7 +619,7 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) { +LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const { errs() << "{\n"; for (DenseMap<uint32_t, Value*>::iterator I = d.begin(), E = d.end(); I != E; ++I) { @@ -1451,6 +1468,95 @@ bool GVN::processLoad(LoadInst *L) { return false; } +/// Return a pair the first field showing the value number of \p Exp and the +/// second field showing whether it is a value number newly created. +std::pair<uint32_t, bool> +GVN::ValueTable::assignExpNewValueNum(Expression &Exp) { + uint32_t &e = expressionNumbering[Exp]; + bool CreateNewValNum = !e; + if (CreateNewValNum) { + Expressions.push_back(Exp); + if (ExprIdx.size() < nextValueNumber + 1) + ExprIdx.resize(nextValueNumber * 2); + e = nextValueNumber; + ExprIdx[nextValueNumber++] = nextExprNumber++; + } + return {e, CreateNewValNum}; +} + +/// Return whether all the values related with the same \p num are +/// defined in \p BB. +bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, + GVN &Gvn) { + LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; + while (Vals && Vals->BB == BB) + Vals = Vals->Next; + return !Vals; +} + +/// Wrap phiTranslateImpl to provide caching functionality. +uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, + const BasicBlock *PhiBlock, uint32_t Num, + GVN &Gvn) { + auto FindRes = PhiTranslateTable.find({Num, Pred}); + if (FindRes != PhiTranslateTable.end()) + return FindRes->second; + uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn); + PhiTranslateTable.insert({{Num, Pred}, NewNum}); + return NewNum; +} + +/// Translate value number \p Num using phis, so that it has the values of +/// the phis in BB. +uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, + const BasicBlock *PhiBlock, + uint32_t Num, GVN &Gvn) { + if (PHINode *PN = NumberingPhi[Num]) { + for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { + if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred) + if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false)) + return TransVal; + } + return Num; + } + + // If there is any value related with Num is defined in a BB other than + // PhiBlock, it cannot depend on a phi in PhiBlock without going through + // a backedge. We can do an early exit in that case to save compile time. + if (!areAllValsInBB(Num, PhiBlock, Gvn)) + return Num; + + if (Num >= ExprIdx.size() || ExprIdx[Num] == 0) + return Num; + Expression Exp = Expressions[ExprIdx[Num]]; + + for (unsigned i = 0; i < Exp.varargs.size(); i++) { + // For InsertValue and ExtractValue, some varargs are index numbers + // instead of value numbers. Those index numbers should not be + // translated. + if ((i > 1 && Exp.opcode == Instruction::InsertValue) || + (i > 0 && Exp.opcode == Instruction::ExtractValue)) + continue; + Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn); + } + + if (Exp.commutative) { + assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!"); + if (Exp.varargs[0] > Exp.varargs[1]) { + std::swap(Exp.varargs[0], Exp.varargs[1]); + uint32_t Opcode = Exp.opcode >> 8; + if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) + Exp.opcode = (Opcode << 8) | + CmpInst::getSwappedPredicate( + static_cast<CmpInst::Predicate>(Exp.opcode & 255)); + } + } + + if (uint32_t NewNum = expressionNumbering[Exp]) + return NewNum; + return Num; +} + // In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, // and then scan the list to find one whose block dominates the block in @@ -1495,6 +1601,15 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, return Pred != nullptr; } + +void GVN::assignBlockRPONumber(Function &F) { + uint32_t NextBlockNumber = 1; + ReversePostOrderTraversal<Function *> RPOT(&F); + for (BasicBlock *BB : RPOT) + BlockRPONumber[BB] = NextBlockNumber++; +} + + // Tries to replace instruction with const, using information from // ReplaceWithConstMap. bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { @@ -1856,6 +1971,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, // Fabricate val-num for dead-code in order to suppress assertion in // performPRE(). assignValNumForDeadCode(); + assignBlockRPONumber(F); bool PREChanged = true; while (PREChanged) { PREChanged = performPRE(F); @@ -1927,7 +2043,7 @@ bool GVN::processBlock(BasicBlock *BB) { // Instantiate an expression in a predecessor that lacked it. bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, - unsigned int ValNo) { + BasicBlock *Curr, unsigned int ValNo) { // Because we are going top-down through the block, all value numbers // will be available in the predecessor by the time we need them. Any // that weren't originally present will have been instantiated earlier @@ -1945,7 +2061,9 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, success = false; break; } - if (Value *V = findLeader(Pred, VN.lookup(Op))) { + uint32_t TValNo = + VN.phiTranslate(Pred, Curr, VN.lookup(Op), *this); + if (Value *V = findLeader(Pred, TValNo)) { Instr->setOperand(i, V); } else { success = false; @@ -1962,10 +2080,12 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Instr->insertBefore(Pred->getTerminator()); Instr->setName(Instr->getName() + ".pre"); Instr->setDebugLoc(Instr->getDebugLoc()); - VN.add(Instr, ValNo); + + unsigned Num = VN.lookupOrAdd(Instr); + VN.add(Instr, Num); // Update the availability map to include the new instruction. - addToLeaderTable(ValNo, Instr, Pred); + addToLeaderTable(Num, Instr, Pred); return true; } @@ -2003,18 +2123,27 @@ bool GVN::performScalarPRE(Instruction *CurInst) { SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap; for (BasicBlock *P : predecessors(CurrentBlock)) { - // We're not interested in PRE where the block is its - // own predecessor, or in blocks with predecessors - // that are not reachable. - if (P == CurrentBlock) { + // We're not interested in PRE where blocks with predecessors that are + // not reachable. + if (!DT->isReachableFromEntry(P)) { NumWithout = 2; break; - } else if (!DT->isReachableFromEntry(P)) { + } + // It is not safe to do PRE when P->CurrentBlock is a loop backedge, and + // when CurInst has operand defined in CurrentBlock (so it may be defined + // by phi in the loop header). + if (BlockRPONumber[P] >= BlockRPONumber[CurrentBlock] && + any_of(CurInst->operands(), [&](const Use &U) { + if (auto *Inst = dyn_cast<Instruction>(U.get())) + return Inst->getParent() == CurrentBlock; + return false; + })) { NumWithout = 2; break; } - Value *predV = findLeader(P, ValNo); + uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); + Value *predV = findLeader(P, TValNo); if (!predV) { predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); PREPred = P; @@ -2054,7 +2183,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { } // We need to insert somewhere, so let's give it a shot PREInstr = CurInst->clone(); - if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) { + if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) { // If we failed insertion, make sure we remove the instruction. DEBUG(verifyRemoved(PREInstr)); PREInstr->deleteValue(); @@ -2168,6 +2297,7 @@ bool GVN::iterateOnFunction(Function &F) { void GVN::cleanupGlobalSets() { VN.clear(); LeaderTable.clear(); + BlockRPONumber.clear(); TableAllocator.Reset(); } diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index c120036464d0..05293eb0079f 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -25,6 +25,7 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" @@ -576,7 +577,12 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( // Handle compare with phi operand, where the PHI is defined in this block. if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) { assert(Preference == WantInteger && "Compares only produce integers"); - PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0)); + Type *CmpType = Cmp->getType(); + Value *CmpLHS = Cmp->getOperand(0); + Value *CmpRHS = Cmp->getOperand(1); + CmpInst::Predicate Pred = Cmp->getPredicate(); + + PHINode *PN = dyn_cast<PHINode>(CmpLHS); if (PN && PN->getParent() == BB) { const DataLayout &DL = PN->getModule()->getDataLayout(); // We can do this simplification if any comparisons fold to true or false. @@ -584,15 +590,15 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *PredBB = PN->getIncomingBlock(i); Value *LHS = PN->getIncomingValue(i); - Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB); + Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB); - Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, {DL}); + Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL}); if (!Res) { if (!isa<Constant>(RHS)) continue; LazyValueInfo::Tristate - ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS, + ResT = LVI->getPredicateOnEdge(Pred, LHS, cast<Constant>(RHS), PredBB, BB, CxtI ? CxtI : Cmp); if (ResT == LazyValueInfo::Unknown) @@ -609,27 +615,67 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( // If comparing a live-in value against a constant, see if we know the // live-in value on any predecessors. - if (isa<Constant>(Cmp->getOperand(1)) && !Cmp->getType()->isVectorTy()) { - Constant *CmpConst = cast<Constant>(Cmp->getOperand(1)); + if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) { + Constant *CmpConst = cast<Constant>(CmpRHS); - if (!isa<Instruction>(Cmp->getOperand(0)) || - cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) { + if (!isa<Instruction>(CmpLHS) || + cast<Instruction>(CmpLHS)->getParent() != BB) { for (BasicBlock *P : predecessors(BB)) { // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. LazyValueInfo::Tristate Res = - LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0), + LVI->getPredicateOnEdge(Pred, CmpLHS, CmpConst, P, BB, CxtI ? CxtI : Cmp); if (Res == LazyValueInfo::Unknown) continue; - Constant *ResC = ConstantInt::get(Cmp->getType(), Res); + Constant *ResC = ConstantInt::get(CmpType, Res); Result.push_back(std::make_pair(ResC, P)); } return !Result.empty(); } + // InstCombine can fold some forms of constant range checks into + // (icmp (add (x, C1)), C2). See if we have we have such a thing with + // x as a live-in. + { + using namespace PatternMatch; + Value *AddLHS; + ConstantInt *AddConst; + if (isa<ConstantInt>(CmpConst) && + match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) { + if (!isa<Instruction>(AddLHS) || + cast<Instruction>(AddLHS)->getParent() != BB) { + for (BasicBlock *P : predecessors(BB)) { + // If the value is known by LazyValueInfo to be a ConstantRange in + // a predecessor, use that information to try to thread this + // block. + ConstantRange CR = LVI->getConstantRangeOnEdge( + AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS)); + // Propagate the range through the addition. + CR = CR.add(AddConst->getValue()); + + // Get the range where the compare returns true. + ConstantRange CmpRange = ConstantRange::makeExactICmpRegion( + Pred, cast<ConstantInt>(CmpConst)->getValue()); + + Constant *ResC; + if (CmpRange.contains(CR)) + ResC = ConstantInt::getTrue(CmpType); + else if (CmpRange.inverse().contains(CR)) + ResC = ConstantInt::getFalse(CmpType); + else + continue; + + Result.push_back(std::make_pair(ResC, P)); + } + + return !Result.empty(); + } + } + } + // Try to find a constant value for the LHS of a comparison, // and evaluate it statically if we can. PredValueInfoTy LHSVals; @@ -638,8 +684,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( for (const auto &LHSVal : LHSVals) { Constant *V = LHSVal.first; - Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(), - V, CmpConst); + Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst); if (Constant *KC = getKnownConstant(Folded, WantInteger)) Result.push_back(std::make_pair(KC, LHSVal.second)); } @@ -752,6 +797,37 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { LVI->eraseBlock(SinglePred); MergeBasicBlockIntoOnlyPred(BB); + // Now that BB is merged into SinglePred (i.e. SinglePred Code followed by + // BB code within one basic block `BB`), we need to invalidate the LVI + // information associated with BB, because the LVI information need not be + // true for all of BB after the merge. For example, + // Before the merge, LVI info and code is as follows: + // SinglePred: <LVI info1 for %p val> + // %y = use of %p + // call @exit() // need not transfer execution to successor. + // assume(%p) // from this point on %p is true + // br label %BB + // BB: <LVI info2 for %p val, i.e. %p is true> + // %x = use of %p + // br label exit + // + // Note that this LVI info for blocks BB and SinglPred is correct for %p + // (info2 and info1 respectively). After the merge and the deletion of the + // LVI info1 for SinglePred. We have the following code: + // BB: <LVI info2 for %p val> + // %y = use of %p + // call @exit() + // assume(%p) + // %x = use of %p <-- LVI info2 is correct from here onwards. + // br label exit + // LVI info2 for BB is incorrect at the beginning of BB. + + // Invalidate LVI information for BB if the LVI is not provably true for + // all of BB. + if (any_of(*BB, [](Instruction &I) { + return !isGuaranteedToTransferExecutionToSuccessor(&I); + })) + LVI->eraseBlock(BB); return true; } } diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp deleted file mode 100644 index 025ba1bfedc1..000000000000 --- a/lib/Transforms/Scalar/LoadCombine.cpp +++ /dev/null @@ -1,295 +0,0 @@ -//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This transformation combines adjacent loads. -/// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/TargetFolder.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" - -using namespace llvm; - -#define DEBUG_TYPE "load-combine" - -STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining"); -STATISTIC(NumLoadsCombined, "Number of loads combined"); - -#define LDCOMBINE_NAME "Combine Adjacent Loads" - -namespace { -struct PointerOffsetPair { - Value *Pointer; - APInt Offset; -}; - -struct LoadPOPPair { - LoadInst *Load; - PointerOffsetPair POP; - /// \brief The new load needs to be created before the first load in IR order. - unsigned InsertOrder; -}; - -class LoadCombine : public BasicBlockPass { - LLVMContext *C; - AliasAnalysis *AA; - DominatorTree *DT; - -public: - LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) { - initializeLoadCombinePass(*PassRegistry::getPassRegistry()); - } - - using llvm::Pass::doInitialization; - bool doInitialization(Function &) override; - bool runOnBasicBlock(BasicBlock &BB) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } - - StringRef getPassName() const override { return LDCOMBINE_NAME; } - static char ID; - - typedef IRBuilder<TargetFolder> BuilderTy; - -private: - BuilderTy *Builder; - - PointerOffsetPair getPointerOffsetPair(LoadInst &); - bool combineLoads(DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &); - bool aggregateLoads(SmallVectorImpl<LoadPOPPair> &); - bool combineLoads(SmallVectorImpl<LoadPOPPair> &); -}; -} - -bool LoadCombine::doInitialization(Function &F) { - DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n"); - C = &F.getContext(); - return true; -} - -PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) { - auto &DL = LI.getModule()->getDataLayout(); - - PointerOffsetPair POP; - POP.Pointer = LI.getPointerOperand(); - unsigned BitWidth = DL.getPointerSizeInBits(LI.getPointerAddressSpace()); - POP.Offset = APInt(BitWidth, 0); - - while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) { - if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) { - APInt LastOffset = POP.Offset; - if (!GEP->accumulateConstantOffset(DL, POP.Offset)) { - // Can't handle GEPs with variable indices. - POP.Offset = LastOffset; - return POP; - } - POP.Pointer = GEP->getPointerOperand(); - } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) { - POP.Pointer = BC->getOperand(0); - } - } - return POP; -} - -bool LoadCombine::combineLoads( - DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &LoadMap) { - bool Combined = false; - for (auto &Loads : LoadMap) { - if (Loads.second.size() < 2) - continue; - std::sort(Loads.second.begin(), Loads.second.end(), - [](const LoadPOPPair &A, const LoadPOPPair &B) { - return A.POP.Offset.slt(B.POP.Offset); - }); - if (aggregateLoads(Loads.second)) - Combined = true; - } - return Combined; -} - -/// \brief Try to aggregate loads from a sorted list of loads to be combined. -/// -/// It is guaranteed that no writes occur between any of the loads. All loads -/// have the same base pointer. There are at least two loads. -bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) { - assert(Loads.size() >= 2 && "Insufficient loads!"); - LoadInst *BaseLoad = nullptr; - SmallVector<LoadPOPPair, 8> AggregateLoads; - bool Combined = false; - bool ValidPrevOffset = false; - APInt PrevOffset; - uint64_t PrevSize = 0; - for (auto &L : Loads) { - if (ValidPrevOffset == false) { - BaseLoad = L.Load; - PrevOffset = L.POP.Offset; - PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( - L.Load->getType()); - AggregateLoads.push_back(L); - ValidPrevOffset = true; - continue; - } - if (L.Load->getAlignment() > BaseLoad->getAlignment()) - continue; - APInt PrevEnd = PrevOffset + PrevSize; - if (L.POP.Offset.sgt(PrevEnd)) { - // No other load will be combinable - if (combineLoads(AggregateLoads)) - Combined = true; - AggregateLoads.clear(); - ValidPrevOffset = false; - continue; - } - if (L.POP.Offset != PrevEnd) - // This load is offset less than the size of the last load. - // FIXME: We may want to handle this case. - continue; - PrevOffset = L.POP.Offset; - PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( - L.Load->getType()); - AggregateLoads.push_back(L); - } - if (combineLoads(AggregateLoads)) - Combined = true; - return Combined; -} - -/// \brief Given a list of combinable load. Combine the maximum number of them. -bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { - // Remove loads from the end while the size is not a power of 2. - unsigned TotalSize = 0; - for (const auto &L : Loads) - TotalSize += L.Load->getType()->getPrimitiveSizeInBits(); - while (TotalSize != 0 && !isPowerOf2_32(TotalSize)) - TotalSize -= Loads.pop_back_val().Load->getType()->getPrimitiveSizeInBits(); - if (Loads.size() < 2) - return false; - - DEBUG({ - dbgs() << "***** Combining Loads ******\n"; - for (const auto &L : Loads) { - dbgs() << L.POP.Offset << ": " << *L.Load << "\n"; - } - }); - - // Find first load. This is where we put the new load. - LoadPOPPair FirstLP; - FirstLP.InsertOrder = -1u; - for (const auto &L : Loads) - if (L.InsertOrder < FirstLP.InsertOrder) - FirstLP = L; - - unsigned AddressSpace = - FirstLP.POP.Pointer->getType()->getPointerAddressSpace(); - - Builder->SetInsertPoint(FirstLP.Load); - Value *Ptr = Builder->CreateConstGEP1_64( - Builder->CreatePointerCast(Loads[0].POP.Pointer, - Builder->getInt8PtrTy(AddressSpace)), - Loads[0].POP.Offset.getSExtValue()); - LoadInst *NewLoad = new LoadInst( - Builder->CreatePointerCast( - Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize), - Ptr->getType()->getPointerAddressSpace())), - Twine(Loads[0].Load->getName()) + ".combined", false, - Loads[0].Load->getAlignment(), FirstLP.Load); - - for (const auto &L : Loads) { - Builder->SetInsertPoint(L.Load); - Value *V = Builder->CreateExtractInteger( - L.Load->getModule()->getDataLayout(), NewLoad, - cast<IntegerType>(L.Load->getType()), - (L.POP.Offset - Loads[0].POP.Offset).getZExtValue(), "combine.extract"); - L.Load->replaceAllUsesWith(V); - } - - NumLoadsCombined += Loads.size(); - return true; -} - -bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { - if (skipBasicBlock(BB)) - return false; - - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - - // Skip analysing dead blocks (not forward reachable from function entry). - if (!DT->isReachableFromEntry(&BB)) { - DEBUG(dbgs() << "LC: skipping unreachable " << BB.getName() << - " in " << BB.getParent()->getName() << "\n"); - return false; - } - - IRBuilder<TargetFolder> TheBuilder( - BB.getContext(), TargetFolder(BB.getModule()->getDataLayout())); - Builder = &TheBuilder; - - DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap; - AliasSetTracker AST(*AA); - - bool Combined = false; - unsigned Index = 0; - for (auto &I : BB) { - if (I.mayThrow() || AST.containsUnknown(&I)) { - if (combineLoads(LoadMap)) - Combined = true; - LoadMap.clear(); - AST.clear(); - continue; - } - if (I.mayWriteToMemory()) { - AST.add(&I); - continue; - } - LoadInst *LI = dyn_cast<LoadInst>(&I); - if (!LI) - continue; - ++NumLoadsAnalyzed; - if (!LI->isSimple() || !LI->getType()->isIntegerTy()) - continue; - auto POP = getPointerOffsetPair(*LI); - if (!POP.Pointer) - continue; - LoadMap[POP.Pointer].push_back({LI, std::move(POP), Index++}); - AST.add(LI); - } - if (combineLoads(LoadMap)) - Combined = true; - return Combined; -} - -char LoadCombine::ID = 0; - -BasicBlockPass *llvm::createLoadCombinePass() { - return new LoadCombine(); -} - -INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false) diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 3151ccd279c4..c41cc42db5e2 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -31,20 +31,19 @@ using namespace llvm; STATISTIC(NumDeleted, "Number of loops deleted"); /// This function deletes dead loops. The caller of this function needs to -/// guarantee that the loop is infact dead. Here we handle two kinds of dead +/// guarantee that the loop is infact dead. Here we handle two kinds of dead /// loop. The first kind (\p isLoopDead) is where only invariant values from /// within the loop are used outside of it. The second kind (\p /// isLoopNeverExecuted) is where the loop is provably never executed. We can -/// always remove never executed loops since they will not cause any -/// difference to program behaviour. +/// always remove never executed loops since they will not cause any difference +/// to program behaviour. /// /// This also updates the relevant analysis information in \p DT, \p SE, and \p /// LI. It also updates the loop PM if an updater struct is provided. // TODO: This function will be used by loop-simplifyCFG as well. So, move this // to LoopUtils.cpp static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, - LoopInfo &LI, bool LoopIsNeverExecuted, - LPMUpdater *Updater = nullptr); + LoopInfo &LI, LPMUpdater *Updater = nullptr); /// Determines if a loop is dead. /// /// This assumes that we've already checked for unique exit and exiting blocks, @@ -168,7 +167,14 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, BasicBlock *ExitBlock = L->getUniqueExitBlock(); if (ExitBlock && isLoopNeverExecuted(L)) { - deleteDeadLoop(L, DT, SE, LI, true /* LoopIsNeverExecuted */, Updater); + // Set incoming value to undef for phi nodes in the exit block. + BasicBlock::iterator BI = ExitBlock->begin(); + while (PHINode *P = dyn_cast<PHINode>(BI)) { + for (unsigned i = 0; i < P->getNumIncomingValues(); i++) + P->setIncomingValue(i, UndefValue::get(P->getType())); + BI++; + } + deleteDeadLoop(L, DT, SE, LI, Updater); ++NumDeleted; return true; } @@ -196,15 +202,14 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, if (isa<SCEVCouldNotCompute>(S)) return Changed; - deleteDeadLoop(L, DT, SE, LI, false /* LoopIsNeverExecuted */, Updater); + deleteDeadLoop(L, DT, SE, LI, Updater); ++NumDeleted; return true; } static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, - LoopInfo &LI, bool LoopIsNeverExecuted, - LPMUpdater *Updater) { + LoopInfo &LI, LPMUpdater *Updater) { assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); auto *Preheader = L->getLoopPreheader(); assert(Preheader && "Preheader should exist!"); @@ -227,6 +232,8 @@ static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, auto *ExitBlock = L->getUniqueExitBlock(); assert(ExitBlock && "Should have a unique exit block!"); + assert(L->hasDedicatedExits() && "Loop should have dedicated exits!"); + // Connect the preheader directly to the exit block. // Even when the loop is never executed, we cannot remove the edge from the // source block to the exit block. Consider the case where the unexecuted loop @@ -236,20 +243,28 @@ static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, // non-loop, it will be deleted in a future iteration of loop deletion pass. Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), ExitBlock); - SmallVector<BasicBlock *, 4> ExitingBlocks; - L->getExitingBlocks(ExitingBlocks); // Rewrite phis in the exit block to get their inputs from the Preheader // instead of the exiting block. - BasicBlock *ExitingBlock = ExitingBlocks[0]; BasicBlock::iterator BI = ExitBlock->begin(); while (PHINode *P = dyn_cast<PHINode>(BI)) { - int j = P->getBasicBlockIndex(ExitingBlock); - assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); - if (LoopIsNeverExecuted) - P->setIncomingValue(j, UndefValue::get(P->getType())); - P->setIncomingBlock(j, Preheader); - for (unsigned i = 1; i < ExitingBlocks.size(); ++i) - P->removeIncomingValue(ExitingBlocks[i]); + // Set the zero'th element of Phi to be from the preheader and remove all + // other incoming values. Given the loop has dedicated exits, all other + // incoming values must be from the exiting blocks. + int PredIndex = 0; + P->setIncomingBlock(PredIndex, Preheader); + // Removes all incoming values from all other exiting blocks (including + // duplicate values from an exiting block). + // Nuke all entries except the zero'th entry which is the preheader entry. + // NOTE! We need to remove Incoming Values in the reverse order as done + // below, to keep the indices valid for deletion (removeIncomingValues + // updates getNumIncomingValues and shifts all values down into the operand + // being deleted). + for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i) + P->removeIncomingValue(e-i, false); + + assert((P->getNumIncomingValues() == 1 && + P->getIncomingBlock(PredIndex) == Preheader) && + "Should have exactly one value and that's from the preheader!"); ++BI; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index b027278b24f2..73436f13c94e 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -131,7 +131,7 @@ static cl::opt<bool> EnablePhiElim( // The flag adds instruction count to solutions cost comparision. static cl::opt<bool> InsnsCost( - "lsr-insns-cost", cl::Hidden, cl::init(true), + "lsr-insns-cost", cl::Hidden, cl::init(false), cl::desc("Add instruction count to a LSR cost model")); // Flag to choose how to narrow complex lsr solution diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index cbbd55512c9f..7a7624f77542 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -1244,27 +1244,24 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const { // only do this for simple stores, we should expand to cover memcpys, etc. const auto *LastStore = createStoreExpression(SI, StoreRHS); const auto *LastCC = ExpressionToClass.lookup(LastStore); - // Basically, check if the congruence class the store is in is defined by a - // store that isn't us, and has the same value. MemorySSA takes care of - // ensuring the store has the same memory state as us already. - // The RepStoredValue gets nulled if all the stores disappear in a class, so - // we don't need to check if the class contains a store besides us. - if (LastCC && - LastCC->getStoredValue() == lookupOperandLeader(SI->getValueOperand())) + // We really want to check whether the expression we matched was a store. No + // easy way to do that. However, we can check that the class we found has a + // store, which, assuming the value numbering state is not corrupt, is + // sufficient, because we must also be equivalent to that store's expression + // for it to be in the same class as the load. + if (LastCC && LastCC->getStoredValue() == LastStore->getStoredValue()) return LastStore; - deleteExpression(LastStore); // Also check if our value operand is defined by a load of the same memory // location, and the memory state is the same as it was then (otherwise, it // could have been overwritten later. See test32 in // transforms/DeadStoreElimination/simple.ll). - if (auto *LI = - dyn_cast<LoadInst>(lookupOperandLeader(SI->getValueOperand()))) { + if (auto *LI = dyn_cast<LoadInst>(LastStore->getStoredValue())) if ((lookupOperandLeader(LI->getPointerOperand()) == - lookupOperandLeader(SI->getPointerOperand())) && + LastStore->getOperand(0)) && (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) == StoreRHS)) - return createStoreExpression(SI, StoreRHS); - } + return LastStore; + deleteExpression(LastStore); } // If the store is not equivalent to anything, value number it as a store that @@ -2332,9 +2329,7 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) { // see if we know some constant value for it already. Value *NewGVN::findConditionEquivalence(Value *Cond) const { auto Result = lookupOperandLeader(Cond); - if (isa<Constant>(Result)) - return Result; - return nullptr; + return isa<Constant>(Result) ? Result : nullptr; } // Process the outgoing edges of a block for reachability. @@ -3014,14 +3009,27 @@ void NewGVN::verifyIterationSettled(Function &F) { // a no-longer valid StoreExpression. void NewGVN::verifyStoreExpressions() const { #ifndef NDEBUG - DenseSet<std::pair<const Value *, const Value *>> StoreExpressionSet; + // This is the only use of this, and it's not worth defining a complicated + // densemapinfo hash/equality function for it. + std::set< + std::pair<const Value *, + std::tuple<const Value *, const CongruenceClass *, Value *>>> + StoreExpressionSet; for (const auto &KV : ExpressionToClass) { if (auto *SE = dyn_cast<StoreExpression>(KV.first)) { // Make sure a version that will conflict with loads is not already there - auto Res = - StoreExpressionSet.insert({SE->getOperand(0), SE->getMemoryLeader()}); - assert(Res.second && - "Stored expression conflict exists in expression table"); + auto Res = StoreExpressionSet.insert( + {SE->getOperand(0), std::make_tuple(SE->getMemoryLeader(), KV.second, + SE->getStoredValue())}); + bool Okay = Res.second; + // It's okay to have the same expression already in there if it is + // identical in nature. + // This can happen when the leader of the stored value changes over time. + if (!Okay) + Okay = (std::get<1>(Res.first->second) == KV.second) && + (lookupOperandLeader(std::get<2>(Res.first->second)) == + lookupOperandLeader(SE->getStoredValue())); + assert(Okay && "Stored expression conflict exists in expression table"); auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst()); assert(ValueExpr && ValueExpr->equals(*SE) && "StoreExpression in ExpressionToClass is not latest " diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index a20890b22603..6da551bd7efd 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -106,11 +107,12 @@ XorOpnd::XorOpnd(Value *V) { I->getOpcode() == Instruction::And)) { Value *V0 = I->getOperand(0); Value *V1 = I->getOperand(1); - if (isa<ConstantInt>(V0)) + const APInt *C; + if (match(V0, PatternMatch::m_APInt(C))) std::swap(V0, V1); - if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) { - ConstPart = C->getValue(); + if (match(V1, PatternMatch::m_APInt(C))) { + ConstPart = *C; SymbolicPart = V0; isOr = (I->getOpcode() == Instruction::Or); return; @@ -119,7 +121,7 @@ XorOpnd::XorOpnd(Value *V) { // view the operand as "V | 0" SymbolicPart = V; - ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth()); + ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits()); isOr = true; } @@ -955,8 +957,8 @@ static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { /// Scan backwards and forwards among values with the same rank as element i /// to see if X exists. If X does not exist, return i. This is useful when /// scanning for 'x' when we see '-x' because they both get the same rank. -static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, - Value *X) { +static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops, + unsigned i, Value *X) { unsigned XRank = Ops[i].Rank; unsigned e = Ops.size(); for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) { @@ -1134,20 +1136,19 @@ static Value *OptimizeAndOrXor(unsigned Opcode, /// instruction. There are two special cases: 1) if the constant operand is 0, /// it will return NULL. 2) if the constant is ~0, the symbolic operand will /// be returned. -static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, +static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, const APInt &ConstOpnd) { - if (ConstOpnd != 0) { - if (!ConstOpnd.isAllOnesValue()) { - LLVMContext &Ctx = Opnd->getType()->getContext(); - Instruction *I; - I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd), - "and.ra", InsertBefore); - I->setDebugLoc(InsertBefore->getDebugLoc()); - return I; - } + if (ConstOpnd.isNullValue()) + return nullptr; + + if (ConstOpnd.isAllOnesValue()) return Opnd; - } - return nullptr; + + Instruction *I = BinaryOperator::CreateAnd( + Opnd, ConstantInt::get(Opnd->getType(), ConstOpnd), "and.ra", + InsertBefore); + I->setDebugLoc(InsertBefore->getDebugLoc()); + return I; } // Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd" @@ -1163,24 +1164,24 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, // = ((x | c1) ^ c1) ^ (c1 ^ c2) // = (x & ~c1) ^ (c1 ^ c2) // It is useful only when c1 == c2. - if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) { - if (!Opnd1->getValue()->hasOneUse()) - return false; + if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue()) + return false; - const APInt &C1 = Opnd1->getConstPart(); - if (C1 != ConstOpnd) - return false; + if (!Opnd1->getValue()->hasOneUse()) + return false; - Value *X = Opnd1->getSymbolicPart(); - Res = createAndInstr(I, X, ~C1); - // ConstOpnd was C2, now C1 ^ C2. - ConstOpnd ^= C1; + const APInt &C1 = Opnd1->getConstPart(); + if (C1 != ConstOpnd) + return false; - if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue())) - RedoInsts.insert(T); - return true; - } - return false; + Value *X = Opnd1->getSymbolicPart(); + Res = createAndInstr(I, X, ~C1); + // ConstOpnd was C2, now C1 ^ C2. + ConstOpnd ^= C1; + + if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue())) + RedoInsts.insert(T); + return true; } @@ -1221,8 +1222,8 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt C3((~C1) ^ C2); // Do not increase code size! - if (C3 != 0 && !C3.isAllOnesValue()) { - int NewInstNum = ConstOpnd != 0 ? 1 : 2; + if (!C3.isNullValue() && !C3.isAllOnesValue()) { + int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2; if (NewInstNum > DeadInstNum) return false; } @@ -1238,8 +1239,8 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt C3 = C1 ^ C2; // Do not increase code size - if (C3 != 0 && !C3.isAllOnesValue()) { - int NewInstNum = ConstOpnd != 0 ? 1 : 2; + if (!C3.isNullValue() && !C3.isAllOnesValue()) { + int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2; if (NewInstNum > DeadInstNum) return false; } @@ -1279,17 +1280,20 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, SmallVector<XorOpnd, 8> Opnds; SmallVector<XorOpnd*, 8> OpndPtrs; Type *Ty = Ops[0].Op->getType(); - APInt ConstOpnd(Ty->getIntegerBitWidth(), 0); + APInt ConstOpnd(Ty->getScalarSizeInBits(), 0); // Step 1: Convert ValueEntry to XorOpnd for (unsigned i = 0, e = Ops.size(); i != e; ++i) { Value *V = Ops[i].Op; - if (!isa<ConstantInt>(V)) { + const APInt *C; + // TODO: Support non-splat vectors. + if (match(V, PatternMatch::m_APInt(C))) { + ConstOpnd ^= *C; + } else { XorOpnd O(V); O.setSymbolicRank(getRank(O.getSymbolicPart())); Opnds.push_back(O); - } else - ConstOpnd ^= cast<ConstantInt>(V)->getValue(); + } } // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds". @@ -1327,7 +1331,8 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, Value *CV; // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd" - if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) { + if (!ConstOpnd.isNullValue() && + CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) { Changed = true; if (CV) *CurrOpnd = XorOpnd(CV); @@ -1369,17 +1374,17 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, ValueEntry VE(getRank(O.getValue()), O.getValue()); Ops.push_back(VE); } - if (ConstOpnd != 0) { - Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd); + if (!ConstOpnd.isNullValue()) { + Value *C = ConstantInt::get(Ty, ConstOpnd); ValueEntry VE(getRank(C), C); Ops.push_back(VE); } - int Sz = Ops.size(); + unsigned Sz = Ops.size(); if (Sz == 1) return Ops.back().Op; - else if (Sz == 0) { - assert(ConstOpnd == 0); - return ConstantInt::get(Ty->getContext(), ConstOpnd); + if (Sz == 0) { + assert(ConstOpnd.isNullValue()); + return ConstantInt::get(Ty, ConstOpnd); } } @@ -1627,8 +1632,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, /// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)] /// /// \returns Whether any factors have a power greater than one. -bool ReassociatePass::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, - SmallVectorImpl<Factor> &Factors) { +static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, + SmallVectorImpl<Factor> &Factors) { // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this. // Compute the sum of powers of simplifiable factors. unsigned FactorPowerSum = 0; @@ -1999,11 +2004,6 @@ void ReassociatePass::OptimizeInst(Instruction *I) { if (I->isCommutative()) canonicalizeOperands(I); - // TODO: We should optimize vector Xor instructions, but they are - // currently unsupported. - if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor) - return; - // Don't optimize floating point instructions that don't have unsafe algebra. if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra()) return; diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index c6929c33b3e9..7a6fa1711411 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -536,9 +536,10 @@ private: void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } void visitFenceInst (FenceInst &I) { /*returns void*/ } void visitInstruction(Instruction &I) { - // If a new instruction is added to LLVM that we don't handle. + // All the instructions we don't do any special handling for just + // go to overdefined. DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n'); - markOverdefined(&I); // Just in case + markOverdefined(&I); } }; @@ -1814,15 +1815,11 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, if (F.isDeclaration()) continue; - if (Solver.isBlockExecutable(&F.front())) { + if (Solver.isBlockExecutable(&F.front())) for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; - ++AI) { - if (AI->use_empty()) - continue; - if (tryToReplaceWithConstant(Solver, &*AI)) + ++AI) + if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI)) ++IPNumArgsElimed; - } - } for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { if (!Solver.isBlockExecutable(&*BB)) { diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 1527f15f18a3..80fbbeb6829b 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1252,7 +1252,7 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) { if (!LI || !LI->isSimple()) return false; - // Both operands to the select need to be dereferencable, either + // Both operands to the select need to be dereferenceable, either // absolutely (e.g. allocas) or at this point because we can see other // accesses to it. if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), DL, LI)) @@ -1637,8 +1637,17 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { return cast<PointerType>(NewTy)->getPointerAddressSpace() == cast<PointerType>(OldTy)->getPointerAddressSpace(); } - if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) - return true; + + // We can convert integers to integral pointers, but not to non-integral + // pointers. + if (OldTy->isIntegerTy()) + return !DL.isNonIntegralPointerType(NewTy); + + // We can convert integral pointers to integers, but non-integral pointers + // need to remain pointers. + if (!DL.isNonIntegralPointerType(OldTy)) + return NewTy->isIntegerTy(); + return false; } diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 850a01114eeb..ce6f93eb0c15 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -91,7 +91,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSeparateConstOffsetFromGEPPass(Registry); initializeSpeculativeExecutionLegacyPassPass(Registry); initializeStraightLineStrengthReducePass(Registry); - initializeLoadCombinePass(Registry); initializePlaceBackedgeSafepointsImplPass(Registry); initializePlaceSafepointsPass(Registry); initializeFloat2IntLegacyPassPass(Registry); diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 3e5993618c4c..9397b87cdf56 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -321,7 +321,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) { /// instruction from after the call to before the call, assuming that all /// instructions between the call and this instruction are movable. /// -static bool canMoveAboveCall(Instruction *I, CallInst *CI) { +static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { // FIXME: We can move load/store/call/free instructions above the call if the // call does not mod/ref the memory location being processed. if (I->mayHaveSideEffects()) // This also handles volatile loads. @@ -332,10 +332,10 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI) { if (CI->mayHaveSideEffects()) { // Non-volatile loads may be moved above a call with side effects if it // does not write to memory and the load provably won't trap. - // FIXME: Writes to memory only matter if they may alias the pointer + // Writes to memory only matter if they may alias the pointer // being loaded from. const DataLayout &DL = L->getModule()->getDataLayout(); - if (CI->mayWriteToMemory() || + if ((AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod) || !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getAlignment(), DL, L)) return false; @@ -492,10 +492,11 @@ static CallInst *findTRECandidate(Instruction *TI, return CI; } -static bool -eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVectorImpl<PHINode *> &ArgumentPHIs) { +static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, + BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVectorImpl<PHINode *> &ArgumentPHIs, + AliasAnalysis *AA) { // If we are introducing accumulator recursion to eliminate operations after // the call instruction that are both associative and commutative, the initial // value for the accumulator is placed in this variable. If this value is set @@ -515,7 +516,8 @@ eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, // Check that this is the case now. BasicBlock::iterator BBI(CI); for (++BBI; &*BBI != Ret; ++BBI) { - if (canMoveAboveCall(&*BBI, CI)) continue; + if (canMoveAboveCall(&*BBI, CI, AA)) + continue; // If we can't move the instruction above the call, it might be because it // is an associative and commutative operation that could be transformed @@ -674,12 +676,17 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail, - const TargetTransformInfo *TTI) { + const TargetTransformInfo *TTI, + AliasAnalysis *AA) { bool Change = false; + // Make sure this block is a trivial return block. + assert(BB->getFirstNonPHIOrDbg() == Ret && + "Trying to fold non-trivial return block"); + // If the return block contains nothing but the return and PHI's, // there might be an opportunity to duplicate the return in its - // predecessors and perform TRC there. Look for predecessors that end + // predecessors and perform TRE there. Look for predecessors that end // in unconditional branch and recursive call(s). SmallVector<BranchInst*, 8> UncondBranchPreds; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { @@ -706,7 +713,7 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, BB->eraseFromParent(); eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs); + ArgumentPHIs, AA); ++NumRetDuped; Change = true; } @@ -719,16 +726,18 @@ static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail, - const TargetTransformInfo *TTI) { + const TargetTransformInfo *TTI, + AliasAnalysis *AA) { CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI); if (!CI) return false; return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs); + ArgumentPHIs, AA); } -static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI) { +static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI, + AliasAnalysis *AA) { if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true") return false; @@ -763,11 +772,11 @@ static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI) if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { bool Change = processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs, !CanTRETailMarkedCall, TTI); + ArgumentPHIs, !CanTRETailMarkedCall, TTI, AA); if (!Change && BB->getFirstNonPHIOrDbg() == Ret) - Change = - foldReturnAndProcessPred(BB, Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs, !CanTRETailMarkedCall, TTI); + Change = foldReturnAndProcessPred(BB, Ret, OldEntry, + TailCallsAreMarkedTail, ArgumentPHIs, + !CanTRETailMarkedCall, TTI, AA); MadeChange |= Change; } } @@ -797,6 +806,7 @@ struct TailCallElim : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } @@ -805,7 +815,8 @@ struct TailCallElim : public FunctionPass { return false; return eliminateTailRecursion( - F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F)); + F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F), + &getAnalysis<AAResultsWrapperPass>().getAAResults()); } }; } @@ -826,8 +837,9 @@ PreservedAnalyses TailCallElimPass::run(Function &F, FunctionAnalysisManager &AM) { TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); + AliasAnalysis &AA = AM.getResult<AAManager>(F); - bool Changed = eliminateTailRecursion(F, &TTI); + bool Changed = eliminateTailRecursion(F, &TTI, &AA); if (!Changed) return PreservedAnalyses::all(); diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index ebde1f9a17dd..b60dfb4f3541 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -116,6 +116,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_wcslen: Changed |= setOnlyReadsMemory(F); Changed |= setDoesNotThrow(F); + Changed |= setOnlyAccessesArgMemory(F); Changed |= setDoesNotCapture(F, 0); return Changed; case LibFunc_strchr: diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 2af671636cbd..5127eba3f9ae 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" @@ -1081,7 +1082,7 @@ static bool LdStHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr, } /// See if there is a dbg.value intrinsic for DIVar for the PHI node. -static bool PhiHasDebugValue(DILocalVariable *DIVar, +static bool PhiHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr, PHINode *APN) { // Since we can't guarantee that the original dbg.declare instrinsic @@ -1159,7 +1160,7 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, DbgValue->insertAfter(LI); } -/// Inserts a llvm.dbg.value intrinsic after a phi +/// Inserts a llvm.dbg.value intrinsic after a phi /// that has an associated llvm.dbg.decl intrinsic. void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, PHINode *APN, DIBuilder &Builder) { @@ -1742,12 +1743,12 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, // Preserve !invariant.group in K. break; case LLVMContext::MD_align: - K->setMetadata(Kind, + K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; case LLVMContext::MD_dereferenceable: case LLVMContext::MD_dereferenceable_or_null: - K->setMetadata(Kind, + K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; } @@ -1847,6 +1848,49 @@ bool llvm::callsGCLeafFunction(ImmutableCallSite CS) { return false; } +void llvm::copyNonnullMetadata(const LoadInst &OldLI, MDNode *N, + LoadInst &NewLI) { + auto *NewTy = NewLI.getType(); + + // This only directly applies if the new type is also a pointer. + if (NewTy->isPointerTy()) { + NewLI.setMetadata(LLVMContext::MD_nonnull, N); + return; + } + + // The only other translation we can do is to integral loads with !range + // metadata. + if (!NewTy->isIntegerTy()) + return; + + MDBuilder MDB(NewLI.getContext()); + const Value *Ptr = OldLI.getPointerOperand(); + auto *ITy = cast<IntegerType>(NewTy); + auto *NullInt = ConstantExpr::getPtrToInt( + ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy); + auto *NonNullInt = ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1)); + NewLI.setMetadata(LLVMContext::MD_range, + MDB.createRange(NonNullInt, NullInt)); +} + +void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI, + MDNode *N, LoadInst &NewLI) { + auto *NewTy = NewLI.getType(); + + // Give up unless it is converted to a pointer where there is a single very + // valuable mapping we can do reliably. + // FIXME: It would be nice to propagate this in more ways, but the type + // conversions make it hard. + if (!NewTy->isPointerTy()) + return; + + unsigned BitWidth = DL.getTypeSizeInBits(NewTy); + if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) { + MDNode *NN = MDNode::get(OldLI.getContext(), None); + NewLI.setMetadata(LLVMContext::MD_nonnull, NN); + } +} + namespace { /// A potential constituent of a bitreverse or bswap expression. See /// collectBitParts for a fuller explanation. @@ -1968,7 +2012,7 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals, unsigned NumMaskedBits = AndMask.countPopulation(); if (!MatchBitReversals && NumMaskedBits % 8 != 0) return Result; - + auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, MatchBitReversals, BPS); if (!Res) diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index f3db278ef1e4..e21e34df8ded 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -72,7 +72,6 @@ using namespace llvm; #define DEBUG_TYPE "loop-simplify" -STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted"); STATISTIC(NumNested , "Number of nested loops split out"); // If the block isn't already, move the new block to right after some 'outside @@ -152,37 +151,6 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT, return PreheaderBB; } -/// \brief Ensure that the loop preheader dominates all exit blocks. -/// -/// This method is used to split exit blocks that have predecessors outside of -/// the loop. -static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, - DominatorTree *DT, LoopInfo *LI, - bool PreserveLCSSA) { - SmallVector<BasicBlock*, 8> LoopBlocks; - for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { - BasicBlock *P = *I; - if (L->contains(P)) { - // Don't do this if the loop is exited via an indirect branch. - if (isa<IndirectBrInst>(P->getTerminator())) return nullptr; - - LoopBlocks.push_back(P); - } - } - - assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?"); - BasicBlock *NewExitBB = nullptr; - - NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", DT, LI, - PreserveLCSSA); - if (!NewExitBB) - return nullptr; - - DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " - << NewExitBB->getName() << "\n"); - return NewExitBB; -} - /// Add the specified block, and all of its predecessors, to the specified set, /// if it's not already in there. Stop predecessor traversal when we reach /// StopBlock. @@ -346,16 +314,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, // Split edges to exit blocks from the inner loop, if they emerged in the // process of separating the outer one. - SmallVector<BasicBlock *, 8> ExitBlocks; - L->getExitBlocks(ExitBlocks); - SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(), - ExitBlocks.end()); - for (BasicBlock *ExitBlock : ExitBlockSet) { - if (any_of(predecessors(ExitBlock), - [L](BasicBlock *BB) { return !L->contains(BB); })) { - rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA); - } - } + formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA); if (PreserveLCSSA) { // Fix LCSSA form for L. Some values, which previously were only used inside @@ -563,29 +522,16 @@ ReprocessLoop: BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); - if (Preheader) { - ++NumInserted; + if (Preheader) Changed = true; - } } // Next, check to make sure that all exit nodes of the loop only have // predecessors that are inside of the loop. This check guarantees that the // loop preheader/header will dominate the exit blocks. If the exit block has // predecessors from outside of the loop, split the edge now. - SmallVector<BasicBlock*, 8> ExitBlocks; - L->getExitBlocks(ExitBlocks); - - SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(), - ExitBlocks.end()); - for (BasicBlock *ExitBlock : ExitBlockSet) { - if (any_of(predecessors(ExitBlock), - [L](BasicBlock *BB) { return !L->contains(BB); })) { - rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA); - ++NumInserted; - Changed = true; - } - } + if (formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA)) + Changed = true; // If the header has more than two predecessors at this point (from the // preheader and from multiple backedges), we must adjust the loop. @@ -614,10 +560,8 @@ ReprocessLoop: // insert a new block that all backedges target, then make it jump to the // loop header. LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI); - if (LoopLatch) { - ++NumInserted; + if (LoopLatch) Changed = true; - } } const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); @@ -645,7 +589,22 @@ ReprocessLoop: // loop-invariant instructions out of the way to open up more // opportunities, and the disadvantage of having the responsibility // to preserve dominator information. - if (ExitBlockSet.size() == 1) { + auto HasUniqueExitBlock = [&]() { + BasicBlock *UniqueExit = nullptr; + for (auto *ExitingBB : ExitingBlocks) + for (auto *SuccBB : successors(ExitingBB)) { + if (L->contains(SuccBB)) + continue; + + if (!UniqueExit) + UniqueExit = SuccBB; + else if (UniqueExit != SuccBB) + return false; + } + + return true; + }; + if (HasUniqueExitBlock()) { for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { BasicBlock *ExitingBlock = ExitingBlocks[i]; if (!ExitingBlock->getSinglePredecessor()) continue; diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index a920cd86a26a..5f85e17927fa 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -472,10 +472,22 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // exit block only. if (!L->isLoopSimplifyForm()) return false; - BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop - if (!Exit) - return false; + // Guaranteed by LoopSimplifyForm. + BasicBlock *Latch = L->getLoopLatch(); + + BasicBlock *LatchExit = L->getUniqueExitBlock(); // successor out of loop + if (!LatchExit) + return false; + // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the + // targets of the Latch be the single exit block out of the loop. This needs + // to be guaranteed by the callers of UnrollRuntimeLoopRemainder. + BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator()); + assert((LatchBR->getSuccessor(0) == LatchExit || + LatchBR->getSuccessor(1) == LatchExit) && + "one of the loop latch successors should be " + "the exit block!"); + (void)LatchBR; // Use Scalar Evolution to compute the trip count. This allows more loops to // be unrolled than relying on induction var simplification. if (!SE) @@ -510,25 +522,13 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, if (Log2_32(Count) > BEWidth) return false; - BasicBlock *Latch = L->getLoopLatch(); - - // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the - // targets of the Latch be the single exit block out of the loop. This needs - // to be guaranteed by the callers of UnrollRuntimeLoopRemainder. - BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator()); - assert( - (LatchBR->getSuccessor(0) == Exit || LatchBR->getSuccessor(1) == Exit) && - "one of the loop latch successors should be " - "the exit block!"); - // Avoid warning of unused `LatchBR` variable in release builds. - (void)LatchBR; // Loop structure is the following: // // PreHeader // Header // ... // Latch - // Exit + // LatchExit BasicBlock *NewPreHeader; BasicBlock *NewExit = nullptr; @@ -541,9 +541,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // Split PreHeader to insert a branch around loop for unrolling. NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI); NewPreHeader->setName(PreHeader->getName() + ".new"); - // Split Exit to create phi nodes from branch above. - SmallVector<BasicBlock*, 4> Preds(predecessors(Exit)); - NewExit = SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", + // Split LatchExit to create phi nodes from branch above. + SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit)); + NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa", DT, LI, PreserveLCSSA); // Split NewExit to insert epilog remainder loop. EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI); @@ -570,7 +570,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // Latch Header // *NewExit ... // *EpilogPreHeader Latch - // Exit Exit + // LatchExit LatchExit // Calculate conditions for branch around loop for unrolling // in epilog case and around prolog remainder loop in prolog case. @@ -648,7 +648,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // Clone all the basic blocks in the loop. If Count is 2, we don't clone // the loop, otherwise we create a cloned loop to execute the extra // iterations. This function adds the appropriate CFG connections. - BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit; + BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI); @@ -672,7 +672,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // EpilogHeader Header // ... ... // EpilogLatch Latch - // Exit Exit + // LatchExit LatchExit // Rewrite the cloned instruction operands to use the values created when the // clone is created. @@ -686,7 +686,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, if (UseEpilogRemainder) { // Connect the epilog code to the original loop and update the // PHI functions. - ConnectEpilog(L, ModVal, NewExit, Exit, PreHeader, + ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, NewPreHeader, VMap, DT, LI, PreserveLCSSA); diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index 412f6129407e..0ed33945ef40 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" @@ -29,6 +30,7 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -87,8 +89,7 @@ RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT, // Matches either I & 2^x-1 or 2^x-1 & I. If we find a match, we update RT // with a new integer type of the corresponding bit width. - if (match(J, m_CombineOr(m_And(m_Instruction(I), m_APInt(M)), - m_And(m_APInt(M), m_Instruction(I))))) { + if (match(J, m_c_And(m_Instruction(I), m_APInt(M)))) { int32_t Bits = (*M + 1).exactLogBase2(); if (Bits > 0) { RT = IntegerType::get(Phi->getContext(), Bits); @@ -923,6 +924,69 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop, return true; } +bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA) { + bool Changed = false; + + // We re-use a vector for the in-loop predecesosrs. + SmallVector<BasicBlock *, 4> InLoopPredecessors; + + auto RewriteExit = [&](BasicBlock *BB) { + assert(InLoopPredecessors.empty() && + "Must start with an empty predecessors list!"); + auto Cleanup = make_scope_exit([&] { InLoopPredecessors.clear(); }); + + // See if there are any non-loop predecessors of this exit block and + // keep track of the in-loop predecessors. + bool IsDedicatedExit = true; + for (auto *PredBB : predecessors(BB)) + if (L->contains(PredBB)) { + if (isa<IndirectBrInst>(PredBB->getTerminator())) + // We cannot rewrite exiting edges from an indirectbr. + return false; + + InLoopPredecessors.push_back(PredBB); + } else { + IsDedicatedExit = false; + } + + assert(!InLoopPredecessors.empty() && "Must have *some* loop predecessor!"); + + // Nothing to do if this is already a dedicated exit. + if (IsDedicatedExit) + return false; + + auto *NewExitBB = SplitBlockPredecessors( + BB, InLoopPredecessors, ".loopexit", DT, LI, PreserveLCSSA); + + if (!NewExitBB) + DEBUG(dbgs() << "WARNING: Can't create a dedicated exit block for loop: " + << *L << "\n"); + else + DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " + << NewExitBB->getName() << "\n"); + return true; + }; + + // Walk the exit blocks directly rather than building up a data structure for + // them, but only visit each one once. + SmallPtrSet<BasicBlock *, 4> Visited; + for (auto *BB : L->blocks()) + for (auto *SuccBB : successors(BB)) { + // We're looking for exit blocks so skip in-loop successors. + if (L->contains(SuccBB)) + continue; + + // Visit each exit block exactly once. + if (!Visited.insert(SuccBB).second) + continue; + + Changed |= RewriteExit(SuccBB); + } + + return Changed; +} + /// \brief Returns the instructions that use values defined in the loop. SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) { SmallVector<Instruction *, 8> UsedOutside; diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 1abdb2484850..eac2867233bc 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5702,14 +5702,14 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I, void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // We should not collect Uniforms more than once per VF. Right now, - // this function is called from collectUniformsAndScalars(), which + // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. assert(VF >= 2 && !Uniforms.count(VF) && "This function should not be visited twice for the same VF"); - // Visit the list of Uniforms. If we'll not find any uniform value, we'll + // Visit the list of Uniforms. If we'll not find any uniform value, we'll // not analyze again. Uniforms.count(VF) will return 1. Uniforms[VF].clear(); @@ -5988,10 +5988,10 @@ void InterleavedAccessInfo::collectConstStrideAccesses( continue; Value *Ptr = getPointerOperand(&I); - // We don't check wrapping here because we don't know yet if Ptr will be - // part of a full group or a group with gaps. Checking wrapping for all + // We don't check wrapping here because we don't know yet if Ptr will be + // part of a full group or a group with gaps. Checking wrapping for all // pointers (even those that end up in groups with no gaps) will be overly - // conservative. For full groups, wrapping should be ok since if we would + // conservative. For full groups, wrapping should be ok since if we would // wrap around the address space we would do a memory access at nullptr // even without the transformation. The wrapping checks are therefore // deferred until after we've formed the interleaved groups. @@ -6244,7 +6244,7 @@ void InterleavedAccessInfo::analyzeInterleaving( Instruction *LastMember = Group->getMember(Group->getFactor() - 1); if (LastMember) { Value *LastMemberPtr = getPointerOperand(LastMember); - if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false, + if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false, /*ShouldCheckWrap=*/true)) { DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " "last group member potentially pointer-wrapping.\n"); @@ -6252,9 +6252,9 @@ void InterleavedAccessInfo::analyzeInterleaving( } } else { // Case 3: A non-reversed interleaved load group with gaps: We need - // to execute at least one scalar epilogue iteration. This will ensure + // to execute at least one scalar epilogue iteration. This will ensure // we don't speculatively access memory out-of-bounds. We only need - // to look for a member at index factor - 1, since every group must have + // to look for a member at index factor - 1, since every group must have // a member at index zero. if (Group->isReverse()) { releaseGroup(Group); @@ -7789,8 +7789,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the loop for a trip count threshold: // do not vectorize loops with a tiny trip count. - const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L); - if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) { + unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L); + bool HasExpectedTC = (ExpectedTC > 0); + + if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) { + auto EstimatedTC = getLoopEstimatedTripCount(L); + if (EstimatedTC) { + ExpectedTC = *EstimatedTC; + HasExpectedTC = true; + } + } + + if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing."); if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) @@ -7822,18 +7832,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); - // Compute the weighted frequency of this loop being executed and see if it - // is less than 20% of the function entry baseline frequency. Note that we - // always have a canonical loop here because we think we *can* vectorize. - // FIXME: This is hidden behind a flag due to pervasive problems with - // exactly what block frequency models. - if (LoopVectorizeWithBlockFrequency) { - BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); - if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && - LoopEntryFreq < ColdEntryFreq) - OptForSize = true; - } - // Check the function attributes to see if implicit floats are allowed. // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer @@ -8015,11 +8013,6 @@ bool LoopVectorizePass::runImpl( DB = &DB_; ORE = &ORE_; - // Compute some weights outside of the loop over the loops. Compute this - // using a BranchProbability to re-use its scaling math. - const BranchProbability ColdProb(1, 5); // 20% - ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; - // Don't attempt if // 1. the target claims to have no vector registers, and // 2. interleaving won't help ILP. diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index d1349535f298..b267230d3185 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3327,12 +3327,10 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { BundleMember->Dependencies++; ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) { + if (!DestBundle->IsScheduled) BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { + if (!DestBundle->hasValidDependencies()) WorkList.push_back(DestBundle); - } } } else { // I'm not sure if this can ever happen. But we need to be safe. |