diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2022-07-03 14:10:23 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-07-03 14:10:23 +0000 |
| commit | 145449b1e420787bb99721a429341fa6be3adfb6 (patch) | |
| tree | 1d56ae694a6de602e348dd80165cf881a36600ed /llvm/lib/Transforms/Scalar | |
| parent | ecbca9f5fb7d7613d2b94982c4825eb0d33d6842 (diff) | |
Diffstat (limited to 'llvm/lib/Transforms/Scalar')
76 files changed, 3405 insertions, 3964 deletions
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp index 1cda206a7e14..cdf9de8d78d5 100644 --- a/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -35,7 +35,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" diff --git a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index e4ec5f266eb8..9571e99dfb19 100644 --- a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -15,8 +15,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Instructions.h" -#include "llvm/InitializePasses.h" #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -26,12 +24,11 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Constant.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" diff --git a/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp b/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp index a5e65ffc45fe..155f47b49357 100644 --- a/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp +++ b/llvm/lib/Transforms/Scalar/AnnotationRemarks.cpp @@ -16,11 +16,8 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" 
-#include "llvm/IR/IntrinsicInst.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/MemoryOpRemark.h" diff --git a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp index 95de59fa8262..cc12033fb677 100644 --- a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -57,6 +57,7 @@ #include "llvm/Transforms/Scalar/CallSiteSplitting.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IntrinsicInst.h" @@ -65,7 +66,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" @@ -123,8 +123,8 @@ static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallBase &CB) { return false; } -typedef std::pair<ICmpInst *, unsigned> ConditionTy; -typedef SmallVector<ConditionTy, 2> ConditionsTy; +using ConditionTy = std::pair<ICmpInst *, unsigned>; +using ConditionsTy = SmallVector<ConditionTy, 2>; /// If From has a conditional jump to To, add the condition to Conditions, /// if it is relevant to any argument at CB. @@ -301,10 +301,9 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI, /// Note that in case any arguments at the call-site are constrained by its /// predecessors, new call-sites with more constrained arguments will be /// created in createCallSitesOnPredicatedArgument(). 
-static void splitCallSite( - CallBase &CB, - const SmallVectorImpl<std::pair<BasicBlock *, ConditionsTy>> &Preds, - DomTreeUpdater &DTU) { +static void splitCallSite(CallBase &CB, + ArrayRef<std::pair<BasicBlock *, ConditionsTy>> Preds, + DomTreeUpdater &DTU) { BasicBlock *TailBB = CB.getParent(); bool IsMustTailCall = CB.isMustTailCall(); diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 25e8c3ef3b48..8a1761505d59 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -52,6 +52,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 13963657d183..6dfa2440023f 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -19,15 +19,16 @@ #include "llvm/Analysis/ConstraintSystem.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Scalar.h" #include <string> @@ -42,48 +43,129 @@ DEBUG_COUNTER(EliminatedCounter, "conds-eliminated", "Controls which conditions are eliminated"); static int64_t MaxConstraintValue = std::numeric_limits<int64_t>::max(); +static int64_t MinSignedConstraintValue = std::numeric_limits<int64_t>::min(); namespace { -struct ConstraintTy { - 
SmallVector<int64_t, 8> Coefficients; - ConstraintTy(SmallVector<int64_t, 8> Coefficients) - : Coefficients(Coefficients) {} +class ConstraintInfo; - unsigned size() const { return Coefficients.size(); } +struct StackEntry { + unsigned NumIn; + unsigned NumOut; + bool IsNot; + bool IsSigned = false; + /// Variables that can be removed from the system once the stack entry gets + /// removed. + SmallVector<Value *, 2> ValuesToRelease; + + StackEntry(unsigned NumIn, unsigned NumOut, bool IsNot, bool IsSigned, + SmallVector<Value *, 2> ValuesToRelease) + : NumIn(NumIn), NumOut(NumOut), IsNot(IsNot), IsSigned(IsSigned), + ValuesToRelease(ValuesToRelease) {} }; -/// Struct to manage a list of constraints. -struct ConstraintListTy { - SmallVector<ConstraintTy, 4> Constraints; +/// Struct to express a pre-condition of the form %Op0 Pred %Op1. +struct PreconditionTy { + CmpInst::Predicate Pred; + Value *Op0; + Value *Op1; - ConstraintListTy() {} + PreconditionTy(CmpInst::Predicate Pred, Value *Op0, Value *Op1) + : Pred(Pred), Op0(Op0), Op1(Op1) {} +}; - ConstraintListTy(const SmallVector<ConstraintTy, 4> &Constraints) - : Constraints(Constraints) {} +struct ConstraintTy { + SmallVector<int64_t, 8> Coefficients; + SmallVector<PreconditionTy, 2> Preconditions; - void mergeIn(const ConstraintListTy &Other) { - append_range(Constraints, Other.Constraints); - } + bool IsSigned = false; + bool IsEq = false; + + ConstraintTy() = default; - unsigned size() const { return Constraints.size(); } + ConstraintTy(SmallVector<int64_t, 8> Coefficients, bool IsSigned) + : Coefficients(Coefficients), IsSigned(IsSigned) {} - unsigned empty() const { return Constraints.empty(); } + unsigned size() const { return Coefficients.size(); } + + unsigned empty() const { return Coefficients.empty(); } /// Returns true if any constraint has a non-zero coefficient for any of the /// newly added indices. Zero coefficients for new indices are removed. 
If it /// returns true, no new variable need to be added to the system. bool needsNewIndices(const DenseMap<Value *, unsigned> &NewIndices) { - assert(size() == 1); for (unsigned I = 0; I < NewIndices.size(); ++I) { - int64_t Last = get(0).Coefficients.pop_back_val(); + int64_t Last = Coefficients.pop_back_val(); if (Last != 0) return true; } return false; } - ConstraintTy &get(unsigned I) { return Constraints[I]; } + /// Returns true if all preconditions for this list of constraints are + /// satisfied given \p CS and the corresponding \p Value2Index mapping. + bool isValid(const ConstraintInfo &Info) const; +}; + +/// Wrapper encapsulating separate constraint systems and corresponding value +/// mappings for both unsigned and signed information. Facts are added to and +/// conditions are checked against the corresponding system depending on the +/// signed-ness of their predicates. While the information is kept separate +/// based on signed-ness, certain conditions can be transferred between the two +/// systems. +class ConstraintInfo { + DenseMap<Value *, unsigned> UnsignedValue2Index; + DenseMap<Value *, unsigned> SignedValue2Index; + + ConstraintSystem UnsignedCS; + ConstraintSystem SignedCS; + +public: + DenseMap<Value *, unsigned> &getValue2Index(bool Signed) { + return Signed ? SignedValue2Index : UnsignedValue2Index; + } + const DenseMap<Value *, unsigned> &getValue2Index(bool Signed) const { + return Signed ? SignedValue2Index : UnsignedValue2Index; + } + + ConstraintSystem &getCS(bool Signed) { + return Signed ? SignedCS : UnsignedCS; + } + const ConstraintSystem &getCS(bool Signed) const { + return Signed ? 
SignedCS : UnsignedCS; + } + + void popLastConstraint(bool Signed) { getCS(Signed).popLastConstraint(); } + void popLastNVariables(bool Signed, unsigned N) { + getCS(Signed).popLastNVariables(N); + } + + bool doesHold(CmpInst::Predicate Pred, Value *A, Value *B) const; + + void addFact(CmpInst::Predicate Pred, Value *A, Value *B, bool IsNegated, + unsigned NumIn, unsigned NumOut, + SmallVectorImpl<StackEntry> &DFSInStack); + + /// Turn a comparison of the form \p Op0 \p Pred \p Op1 into a vector of + /// constraints, using indices from the corresponding constraint system. + /// Additional indices for newly discovered values are added to \p NewIndices. + ConstraintTy getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, + DenseMap<Value *, unsigned> &NewIndices) const; + + /// Turn a condition \p CmpI into a vector of constraints, using indices from + /// the corresponding constraint system. Additional indices for newly + /// discovered values are added to \p NewIndices. + ConstraintTy getConstraint(CmpInst *Cmp, + DenseMap<Value *, unsigned> &NewIndices) const { + return getConstraint(Cmp->getPredicate(), Cmp->getOperand(0), + Cmp->getOperand(1), NewIndices); + } + + /// Try to add information from \p A \p Pred \p B to the unsigned/signed + /// system if \p Pred is signed/unsigned. + void transferToOtherSystem(CmpInst::Predicate Pred, Value *A, Value *B, + bool IsNegated, unsigned NumIn, unsigned NumOut, + SmallVectorImpl<StackEntry> &DFSInStack); }; } // namespace @@ -92,11 +174,28 @@ struct ConstraintListTy { // sum of the pairs equals \p V. The first pair is the constant-factor and X // must be nullptr. If the expression cannot be decomposed, returns an empty // vector. 
-static SmallVector<std::pair<int64_t, Value *>, 4> decompose(Value *V) { +static SmallVector<std::pair<int64_t, Value *>, 4> +decompose(Value *V, SmallVector<PreconditionTy, 4> &Preconditions, + bool IsSigned) { + + auto CanUseSExt = [](ConstantInt *CI) { + const APInt &Val = CI->getValue(); + return Val.sgt(MinSignedConstraintValue) && Val.slt(MaxConstraintValue); + }; + // Decompose \p V used with a signed predicate. + if (IsSigned) { + if (auto *CI = dyn_cast<ConstantInt>(V)) { + if (CanUseSExt(CI)) + return {{CI->getSExtValue(), nullptr}}; + } + + return {{0, nullptr}, {1, V}}; + } + if (auto *CI = dyn_cast<ConstantInt>(V)) { - if (CI->isNegative() || CI->uge(MaxConstraintValue)) + if (CI->uge(MaxConstraintValue)) return {}; - return {{CI->getSExtValue(), nullptr}}; + return {{CI->getZExtValue(), nullptr}}; } auto *GEP = dyn_cast<GetElementPtrInst>(V); if (GEP && GEP->getNumOperands() == 2 && GEP->isInBounds()) { @@ -106,11 +205,13 @@ static SmallVector<std::pair<int64_t, Value *>, 4> decompose(Value *V) { // If the index is zero-extended, it is guaranteed to be positive. 
if (match(GEP->getOperand(GEP->getNumOperands() - 1), m_ZExt(m_Value(Op0)))) { - if (match(Op0, m_NUWShl(m_Value(Op1), m_ConstantInt(CI)))) + if (match(Op0, m_NUWShl(m_Value(Op1), m_ConstantInt(CI))) && + CanUseSExt(CI)) return {{0, nullptr}, {1, GEP->getPointerOperand()}, {std::pow(int64_t(2), CI->getSExtValue()), Op1}}; - if (match(Op0, m_NSWAdd(m_Value(Op1), m_ConstantInt(CI)))) + if (match(Op0, m_NSWAdd(m_Value(Op1), m_ConstantInt(CI))) && + CanUseSExt(CI)) return {{CI->getSExtValue(), nullptr}, {1, GEP->getPointerOperand()}, {1, Op1}}; @@ -118,17 +219,19 @@ static SmallVector<std::pair<int64_t, Value *>, 4> decompose(Value *V) { } if (match(GEP->getOperand(GEP->getNumOperands() - 1), m_ConstantInt(CI)) && - !CI->isNegative()) + !CI->isNegative() && CanUseSExt(CI)) return {{CI->getSExtValue(), nullptr}, {1, GEP->getPointerOperand()}}; SmallVector<std::pair<int64_t, Value *>, 4> Result; if (match(GEP->getOperand(GEP->getNumOperands() - 1), - m_NUWShl(m_Value(Op0), m_ConstantInt(CI)))) + m_NUWShl(m_Value(Op0), m_ConstantInt(CI))) && + CanUseSExt(CI)) Result = {{0, nullptr}, {1, GEP->getPointerOperand()}, {std::pow(int64_t(2), CI->getSExtValue()), Op0}}; else if (match(GEP->getOperand(GEP->getNumOperands() - 1), - m_NSWAdd(m_Value(Op0), m_ConstantInt(CI)))) + m_NSWAdd(m_Value(Op0), m_ConstantInt(CI))) && + CanUseSExt(CI)) Result = {{CI->getSExtValue(), nullptr}, {1, GEP->getPointerOperand()}, {1, Op0}}; @@ -136,6 +239,10 @@ static SmallVector<std::pair<int64_t, Value *>, 4> decompose(Value *V) { Op0 = GEP->getOperand(GEP->getNumOperands() - 1); Result = {{0, nullptr}, {1, GEP->getPointerOperand()}, {1, Op0}}; } + // If Op0 is signed non-negative, the GEP is increasing monotonically and + // can be de-composed. 
+ Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0, + ConstantInt::get(Op0->getType(), 0)); return Result; } @@ -145,12 +252,20 @@ static SmallVector<std::pair<int64_t, Value *>, 4> decompose(Value *V) { Value *Op1; ConstantInt *CI; - if (match(V, m_NUWAdd(m_Value(Op0), m_ConstantInt(CI)))) + if (match(V, m_NUWAdd(m_Value(Op0), m_ConstantInt(CI))) && + !CI->uge(MaxConstraintValue)) + return {{CI->getZExtValue(), nullptr}, {1, Op0}}; + if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative() && + CanUseSExt(CI)) { + Preconditions.emplace_back( + CmpInst::ICMP_UGE, Op0, + ConstantInt::get(Op0->getType(), CI->getSExtValue() * -1)); return {{CI->getSExtValue(), nullptr}, {1, Op0}}; + } if (match(V, m_NUWAdd(m_Value(Op0), m_Value(Op1)))) return {{0, nullptr}, {1, Op0}, {1, Op1}}; - if (match(V, m_NUWSub(m_Value(Op0), m_ConstantInt(CI)))) + if (match(V, m_NUWSub(m_Value(Op0), m_ConstantInt(CI))) && CanUseSExt(CI)) return {{-1 * CI->getSExtValue(), nullptr}, {1, Op0}}; if (match(V, m_NUWSub(m_Value(Op0), m_Value(Op1)))) return {{0, nullptr}, {1, Op0}, {-1, Op1}}; @@ -158,73 +273,73 @@ static SmallVector<std::pair<int64_t, Value *>, 4> decompose(Value *V) { return {{0, nullptr}, {1, V}}; } -/// Turn a condition \p CmpI into a vector of constraints, using indices from \p -/// Value2Index. Additional indices for newly discovered values are added to \p -/// NewIndices. -static ConstraintListTy -getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, - const DenseMap<Value *, unsigned> &Value2Index, - DenseMap<Value *, unsigned> &NewIndices) { - int64_t Offset1 = 0; - int64_t Offset2 = 0; - - // First try to look up \p V in Value2Index and NewIndices. Otherwise add a - // new entry to NewIndices. 
- auto GetOrAddIndex = [&Value2Index, &NewIndices](Value *V) -> unsigned { - auto V2I = Value2Index.find(V); - if (V2I != Value2Index.end()) - return V2I->second; - auto NewI = NewIndices.find(V); - if (NewI != NewIndices.end()) - return NewI->second; - auto Insert = - NewIndices.insert({V, Value2Index.size() + NewIndices.size() + 1}); - return Insert.first->second; - }; - - if (Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_UGE) - return getConstraint(CmpInst::getSwappedPredicate(Pred), Op1, Op0, - Value2Index, NewIndices); - - if (Pred == CmpInst::ICMP_EQ) { - if (match(Op1, m_Zero())) - return getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index, - NewIndices); - - auto A = - getConstraint(CmpInst::ICMP_UGE, Op0, Op1, Value2Index, NewIndices); - auto B = - getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index, NewIndices); - A.mergeIn(B); - return A; +ConstraintTy +ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, + DenseMap<Value *, unsigned> &NewIndices) const { + bool IsEq = false; + // Try to convert Pred to one of ULE/SLT/SLE/SLT. + switch (Pred) { + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_SGE: { + Pred = CmpInst::getSwappedPredicate(Pred); + std::swap(Op0, Op1); + break; } - - if (Pred == CmpInst::ICMP_NE && match(Op1, m_Zero())) { - return getConstraint(CmpInst::ICMP_UGT, Op0, Op1, Value2Index, NewIndices); + case CmpInst::ICMP_EQ: + if (match(Op1, m_Zero())) { + Pred = CmpInst::ICMP_ULE; + } else { + IsEq = true; + Pred = CmpInst::ICMP_ULE; + } + break; + case CmpInst::ICMP_NE: + if (!match(Op1, m_Zero())) + return {}; + Pred = CmpInst::getSwappedPredicate(CmpInst::ICMP_UGT); + std::swap(Op0, Op1); + break; + default: + break; } // Only ULE and ULT predicates are supported at the moment. 
- if (Pred != CmpInst::ICMP_ULE && Pred != CmpInst::ICMP_ULT) + if (Pred != CmpInst::ICMP_ULE && Pred != CmpInst::ICMP_ULT && + Pred != CmpInst::ICMP_SLE && Pred != CmpInst::ICMP_SLT) return {}; - auto ADec = decompose(Op0->stripPointerCastsSameRepresentation()); - auto BDec = decompose(Op1->stripPointerCastsSameRepresentation()); + SmallVector<PreconditionTy, 4> Preconditions; + bool IsSigned = CmpInst::isSigned(Pred); + auto &Value2Index = getValue2Index(IsSigned); + auto ADec = decompose(Op0->stripPointerCastsSameRepresentation(), + Preconditions, IsSigned); + auto BDec = decompose(Op1->stripPointerCastsSameRepresentation(), + Preconditions, IsSigned); // Skip if decomposing either of the values failed. if (ADec.empty() || BDec.empty()) return {}; - // Skip trivial constraints without any variables. - if (ADec.size() == 1 && BDec.size() == 1) - return {}; - - Offset1 = ADec[0].first; - Offset2 = BDec[0].first; + int64_t Offset1 = ADec[0].first; + int64_t Offset2 = BDec[0].first; Offset1 *= -1; // Create iterator ranges that skip the constant-factor. auto VariablesA = llvm::drop_begin(ADec); auto VariablesB = llvm::drop_begin(BDec); + // First try to look up \p V in Value2Index and NewIndices. Otherwise add a + // new entry to NewIndices. + auto GetOrAddIndex = [&Value2Index, &NewIndices](Value *V) -> unsigned { + auto V2I = Value2Index.find(V); + if (V2I != Value2Index.end()) + return V2I->second; + auto Insert = + NewIndices.insert({V, Value2Index.size() + NewIndices.size() + 1}); + return Insert.first->second; + }; + // Make sure all variables have entries in Value2Index or NewIndices. for (const auto &KV : concat<std::pair<int64_t, Value *>>(VariablesA, VariablesB)) @@ -232,22 +347,85 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, // Build result constraint, by first adding all coefficients from A and then // subtracting all coefficients from B. 
- SmallVector<int64_t, 8> R(Value2Index.size() + NewIndices.size() + 1, 0); + ConstraintTy Res( + SmallVector<int64_t, 8>(Value2Index.size() + NewIndices.size() + 1, 0), + IsSigned); + Res.IsEq = IsEq; + auto &R = Res.Coefficients; for (const auto &KV : VariablesA) R[GetOrAddIndex(KV.second)] += KV.first; for (const auto &KV : VariablesB) R[GetOrAddIndex(KV.second)] -= KV.first; - R[0] = Offset1 + Offset2 + (Pred == CmpInst::ICMP_ULT ? -1 : 0); - return {{R}}; + int64_t OffsetSum; + if (AddOverflow(Offset1, Offset2, OffsetSum)) + return {}; + if (Pred == (IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT)) + if (AddOverflow(OffsetSum, int64_t(-1), OffsetSum)) + return {}; + R[0] = OffsetSum; + Res.Preconditions = std::move(Preconditions); + return Res; +} + +bool ConstraintTy::isValid(const ConstraintInfo &Info) const { + return Coefficients.size() > 0 && + all_of(Preconditions, [&Info](const PreconditionTy &C) { + return Info.doesHold(C.Pred, C.Op0, C.Op1); + }); +} + +bool ConstraintInfo::doesHold(CmpInst::Predicate Pred, Value *A, + Value *B) const { + DenseMap<Value *, unsigned> NewIndices; + auto R = getConstraint(Pred, A, B, NewIndices); + + if (!NewIndices.empty()) + return false; + + // TODO: properly check NewIndices. + return NewIndices.empty() && R.Preconditions.empty() && !R.IsEq && + !R.empty() && + getCS(CmpInst::isSigned(Pred)).isConditionImplied(R.Coefficients); } -static ConstraintListTy -getConstraint(CmpInst *Cmp, const DenseMap<Value *, unsigned> &Value2Index, - DenseMap<Value *, unsigned> &NewIndices) { - return getConstraint(Cmp->getPredicate(), Cmp->getOperand(0), - Cmp->getOperand(1), Value2Index, NewIndices); +void ConstraintInfo::transferToOtherSystem( + CmpInst::Predicate Pred, Value *A, Value *B, bool IsNegated, unsigned NumIn, + unsigned NumOut, SmallVectorImpl<StackEntry> &DFSInStack) { + // Check if we can combine facts from the signed and unsigned systems to + // derive additional facts. 
+ if (!A->getType()->isIntegerTy()) + return; + // FIXME: This currently depends on the order we add facts. Ideally we + // would first add all known facts and only then try to add additional + // facts. + switch (Pred) { + default: + break; + case CmpInst::ICMP_ULT: + // If B is a signed positive constant, A >=s 0 and A <s B. + if (doesHold(CmpInst::ICMP_SGE, B, ConstantInt::get(B->getType(), 0))) { + addFact(CmpInst::ICMP_SGE, A, ConstantInt::get(B->getType(), 0), + IsNegated, NumIn, NumOut, DFSInStack); + addFact(CmpInst::ICMP_SLT, A, B, IsNegated, NumIn, NumOut, DFSInStack); + } + break; + case CmpInst::ICMP_SLT: + if (doesHold(CmpInst::ICMP_SGE, A, ConstantInt::get(B->getType(), 0))) + addFact(CmpInst::ICMP_ULT, A, B, IsNegated, NumIn, NumOut, DFSInStack); + break; + case CmpInst::ICMP_SGT: + if (doesHold(CmpInst::ICMP_SGE, B, ConstantInt::get(B->getType(), -1))) + addFact(CmpInst::ICMP_UGE, A, ConstantInt::get(B->getType(), 0), + IsNegated, NumIn, NumOut, DFSInStack); + break; + case CmpInst::ICMP_SGE: + if (doesHold(CmpInst::ICMP_SGE, B, ConstantInt::get(B->getType(), 0))) { + addFact(CmpInst::ICMP_UGE, A, B, IsNegated, NumIn, NumOut, DFSInStack); + } + break; + } } namespace { @@ -271,134 +449,253 @@ struct ConstraintOrBlock { Not(Not), Condition(Condition) {} }; -struct StackEntry { - unsigned NumIn; - unsigned NumOut; - CmpInst *Condition; - bool IsNot; +/// Keep state required to build worklist. +struct State { + DominatorTree &DT; + SmallVector<ConstraintOrBlock, 64> WorkList; - StackEntry(unsigned NumIn, unsigned NumOut, CmpInst *Condition, bool IsNot) - : NumIn(NumIn), NumOut(NumOut), Condition(Condition), IsNot(IsNot) {} + State(DominatorTree &DT) : DT(DT) {} + + /// Process block \p BB and add known facts to work-list. + void addInfoFor(BasicBlock &BB); + + /// Returns true if we can add a known condition from BB to its successor + /// block Succ. Each predecessor of Succ can either be BB or be dominated + /// by Succ (e.g. 
the case when adding a condition from a pre-header to a + /// loop header). + bool canAddSuccessor(BasicBlock &BB, BasicBlock *Succ) const { + if (BB.getSingleSuccessor()) { + assert(BB.getSingleSuccessor() == Succ); + return DT.properlyDominates(&BB, Succ); + } + return any_of(successors(&BB), + [Succ](const BasicBlock *S) { return S != Succ; }) && + all_of(predecessors(Succ), [&BB, Succ, this](BasicBlock *Pred) { + return Pred == &BB || DT.dominates(Succ, Pred); + }); + } }; + } // namespace #ifndef NDEBUG -static void dumpWithNames(ConstraintTy &C, +static void dumpWithNames(const ConstraintSystem &CS, DenseMap<Value *, unsigned> &Value2Index) { SmallVector<std::string> Names(Value2Index.size(), ""); for (auto &KV : Value2Index) { Names[KV.second - 1] = std::string("%") + KV.first->getName().str(); } - ConstraintSystem CS; - CS.addVariableRowFill(C.Coefficients); CS.dump(Names); } -#endif -static bool eliminateConstraints(Function &F, DominatorTree &DT) { - bool Changed = false; - DT.updateDFSNumbers(); +static void dumpWithNames(ArrayRef<int64_t> C, + DenseMap<Value *, unsigned> &Value2Index) { ConstraintSystem CS; + CS.addVariableRowFill(C); + dumpWithNames(CS, Value2Index); +} +#endif - SmallVector<ConstraintOrBlock, 64> WorkList; +void State::addInfoFor(BasicBlock &BB) { + WorkList.emplace_back(DT.getNode(&BB)); - // First, collect conditions implied by branches and blocks with their - // Dominator DFS in and out numbers. - for (BasicBlock &BB : F) { - if (!DT.getNode(&BB)) - continue; - WorkList.emplace_back(DT.getNode(&BB)); - - // True as long as long as the current instruction is guaranteed to execute. - bool GuaranteedToExecute = true; - // Scan BB for assume calls. - // TODO: also use this scan to queue conditions to simplify, so we can - // interleave facts from assumes and conditions to simplify in a single - // basic block. And to skip another traversal of each basic block when - // simplifying. 
- for (Instruction &I : BB) { - Value *Cond; - // For now, just handle assumes with a single compare as condition. - if (match(&I, m_Intrinsic<Intrinsic::assume>(m_Value(Cond))) && - isa<CmpInst>(Cond)) { - if (GuaranteedToExecute) { - // The assume is guaranteed to execute when BB is entered, hence Cond - // holds on entry to BB. - WorkList.emplace_back(DT.getNode(&BB), cast<CmpInst>(Cond), false); - } else { - // Otherwise the condition only holds in the successors. - for (BasicBlock *Succ : successors(&BB)) - WorkList.emplace_back(DT.getNode(Succ), cast<CmpInst>(Cond), false); + // True as long as long as the current instruction is guaranteed to execute. + bool GuaranteedToExecute = true; + // Scan BB for assume calls. + // TODO: also use this scan to queue conditions to simplify, so we can + // interleave facts from assumes and conditions to simplify in a single + // basic block. And to skip another traversal of each basic block when + // simplifying. + for (Instruction &I : BB) { + Value *Cond; + // For now, just handle assumes with a single compare as condition. + if (match(&I, m_Intrinsic<Intrinsic::assume>(m_Value(Cond))) && + isa<ICmpInst>(Cond)) { + if (GuaranteedToExecute) { + // The assume is guaranteed to execute when BB is entered, hence Cond + // holds on entry to BB. + WorkList.emplace_back(DT.getNode(&BB), cast<ICmpInst>(Cond), false); + } else { + // Otherwise the condition only holds in the successors. 
+ for (BasicBlock *Succ : successors(&BB)) { + if (!canAddSuccessor(BB, Succ)) + continue; + WorkList.emplace_back(DT.getNode(Succ), cast<ICmpInst>(Cond), false); } } - GuaranteedToExecute &= isGuaranteedToTransferExecutionToSuccessor(&I); } + GuaranteedToExecute &= isGuaranteedToTransferExecutionToSuccessor(&I); + } - auto *Br = dyn_cast<BranchInst>(BB.getTerminator()); - if (!Br || !Br->isConditional()) - continue; + auto *Br = dyn_cast<BranchInst>(BB.getTerminator()); + if (!Br || !Br->isConditional()) + return; - // Returns true if we can add a known condition from BB to its successor - // block Succ. Each predecessor of Succ can either be BB or be dominated by - // Succ (e.g. the case when adding a condition from a pre-header to a loop - // header). - auto CanAdd = [&BB, &DT](BasicBlock *Succ) { - return all_of(predecessors(Succ), [&BB, &DT, Succ](BasicBlock *Pred) { - return Pred == &BB || DT.dominates(Succ, Pred); - }); - }; - // If the condition is an OR of 2 compares and the false successor only has - // the current block as predecessor, queue both negated conditions for the - // false successor. - Value *Op0, *Op1; - if (match(Br->getCondition(), m_LogicalOr(m_Value(Op0), m_Value(Op1))) && - match(Op0, m_Cmp()) && match(Op1, m_Cmp())) { - BasicBlock *FalseSuccessor = Br->getSuccessor(1); - if (CanAdd(FalseSuccessor)) { - WorkList.emplace_back(DT.getNode(FalseSuccessor), cast<CmpInst>(Op0), - true); - WorkList.emplace_back(DT.getNode(FalseSuccessor), cast<CmpInst>(Op1), - true); - } - continue; + // If the condition is an OR of 2 compares and the false successor only has + // the current block as predecessor, queue both negated conditions for the + // false successor. 
+ Value *Op0, *Op1; + if (match(Br->getCondition(), m_LogicalOr(m_Value(Op0), m_Value(Op1))) && + isa<ICmpInst>(Op0) && isa<ICmpInst>(Op1)) { + BasicBlock *FalseSuccessor = Br->getSuccessor(1); + if (canAddSuccessor(BB, FalseSuccessor)) { + WorkList.emplace_back(DT.getNode(FalseSuccessor), cast<ICmpInst>(Op0), + true); + WorkList.emplace_back(DT.getNode(FalseSuccessor), cast<ICmpInst>(Op1), + true); + } + return; + } + + // If the condition is an AND of 2 compares and the true successor only has + // the current block as predecessor, queue both conditions for the true + // successor. + if (match(Br->getCondition(), m_LogicalAnd(m_Value(Op0), m_Value(Op1))) && + isa<ICmpInst>(Op0) && isa<ICmpInst>(Op1)) { + BasicBlock *TrueSuccessor = Br->getSuccessor(0); + if (canAddSuccessor(BB, TrueSuccessor)) { + WorkList.emplace_back(DT.getNode(TrueSuccessor), cast<ICmpInst>(Op0), + false); + WorkList.emplace_back(DT.getNode(TrueSuccessor), cast<ICmpInst>(Op1), + false); + } + return; + } + + auto *CmpI = dyn_cast<ICmpInst>(Br->getCondition()); + if (!CmpI) + return; + if (canAddSuccessor(BB, Br->getSuccessor(0))) + WorkList.emplace_back(DT.getNode(Br->getSuccessor(0)), CmpI, false); + if (canAddSuccessor(BB, Br->getSuccessor(1))) + WorkList.emplace_back(DT.getNode(Br->getSuccessor(1)), CmpI, true); +} + +void ConstraintInfo::addFact(CmpInst::Predicate Pred, Value *A, Value *B, + bool IsNegated, unsigned NumIn, unsigned NumOut, + SmallVectorImpl<StackEntry> &DFSInStack) { + // If the constraint has a pre-condition, skip the constraint if it does not + // hold. 
+ DenseMap<Value *, unsigned> NewIndices; + auto R = getConstraint(Pred, A, B, NewIndices); + if (!R.isValid(*this)) + return; + + //LLVM_DEBUG(dbgs() << "Adding " << *Condition << " " << IsNegated << "\n"); + bool Added = false; + assert(CmpInst::isSigned(Pred) == R.IsSigned && + "condition and constraint signs must match"); + auto &CSToUse = getCS(R.IsSigned); + if (R.Coefficients.empty()) + return; + + Added |= CSToUse.addVariableRowFill(R.Coefficients); + + // If R has been added to the system, queue it for removal once it goes + // out-of-scope. + if (Added) { + SmallVector<Value *, 2> ValuesToRelease; + for (auto &KV : NewIndices) { + getValue2Index(R.IsSigned).insert(KV); + ValuesToRelease.push_back(KV.first); + } + + LLVM_DEBUG({ + dbgs() << " constraint: "; + dumpWithNames(R.Coefficients, getValue2Index(R.IsSigned)); + }); + + DFSInStack.emplace_back(NumIn, NumOut, IsNegated, R.IsSigned, + ValuesToRelease); + + if (R.IsEq) { + // Also add the inverted constraint for equality constraints. + for (auto &Coeff : R.Coefficients) + Coeff *= -1; + CSToUse.addVariableRowFill(R.Coefficients); + + DFSInStack.emplace_back(NumIn, NumOut, IsNegated, R.IsSigned, + SmallVector<Value *, 2>()); } + } +} + +static void +tryToSimplifyOverflowMath(IntrinsicInst *II, ConstraintInfo &Info, + SmallVectorImpl<Instruction *> &ToRemove) { + auto DoesConditionHold = [](CmpInst::Predicate Pred, Value *A, Value *B, + ConstraintInfo &Info) { + DenseMap<Value *, unsigned> NewIndices; + auto R = Info.getConstraint(Pred, A, B, NewIndices); + if (R.size() < 2 || R.needsNewIndices(NewIndices) || !R.isValid(Info)) + return false; + + auto &CSToUse = Info.getCS(CmpInst::isSigned(Pred)); + return CSToUse.isConditionImplied(R.Coefficients); + }; + + if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow) { + // If A s>= B && B s>= 0, ssub.with.overflow(a, b) should not overflow and + // can be simplified to a regular sub. 
+ Value *A = II->getArgOperand(0); + Value *B = II->getArgOperand(1); + if (!DoesConditionHold(CmpInst::ICMP_SGE, A, B, Info) || + !DoesConditionHold(CmpInst::ICMP_SGE, B, + ConstantInt::get(A->getType(), 0), Info)) + return; + + IRBuilder<> Builder(II->getParent(), II->getIterator()); + Value *Sub = nullptr; + for (User *U : make_early_inc_range(II->users())) { + if (match(U, m_ExtractValue<0>(m_Value()))) { + if (!Sub) + Sub = Builder.CreateSub(A, B); + U->replaceAllUsesWith(Sub); + } else if (match(U, m_ExtractValue<1>(m_Value()))) + U->replaceAllUsesWith(Builder.getFalse()); + else + continue; - // If the condition is an AND of 2 compares and the true successor only has - // the current block as predecessor, queue both conditions for the true - // successor. - if (match(Br->getCondition(), m_LogicalAnd(m_Value(Op0), m_Value(Op1))) && - match(Op0, m_Cmp()) && match(Op1, m_Cmp())) { - BasicBlock *TrueSuccessor = Br->getSuccessor(0); - if (CanAdd(TrueSuccessor)) { - WorkList.emplace_back(DT.getNode(TrueSuccessor), cast<CmpInst>(Op0), - false); - WorkList.emplace_back(DT.getNode(TrueSuccessor), cast<CmpInst>(Op1), - false); + if (U->use_empty()) { + auto *I = cast<Instruction>(U); + ToRemove.push_back(I); + I->setOperand(0, PoisonValue::get(II->getType())); } - continue; } - auto *CmpI = dyn_cast<CmpInst>(Br->getCondition()); - if (!CmpI) + if (II->use_empty()) + II->eraseFromParent(); + } +} + +static bool eliminateConstraints(Function &F, DominatorTree &DT) { + bool Changed = false; + DT.updateDFSNumbers(); + + ConstraintInfo Info; + State S(DT); + + // First, collect conditions implied by branches and blocks with their + // Dominator DFS in and out numbers. 
+ for (BasicBlock &BB : F) { + if (!DT.getNode(&BB)) continue; - if (CanAdd(Br->getSuccessor(0))) - WorkList.emplace_back(DT.getNode(Br->getSuccessor(0)), CmpI, false); - if (CanAdd(Br->getSuccessor(1))) - WorkList.emplace_back(DT.getNode(Br->getSuccessor(1)), CmpI, true); + S.addInfoFor(BB); } // Next, sort worklist by dominance, so that dominating blocks and conditions // come before blocks and conditions dominated by them. If a block and a // condition have the same numbers, the condition comes before the block, as // it holds on entry to the block. - sort(WorkList, [](const ConstraintOrBlock &A, const ConstraintOrBlock &B) { + stable_sort(S.WorkList, [](const ConstraintOrBlock &A, const ConstraintOrBlock &B) { return std::tie(A.NumIn, A.IsBlock) < std::tie(B.NumIn, B.IsBlock); }); + SmallVector<Instruction *> ToRemove; + // Finally, process ordered worklist and eliminate implied conditions. SmallVector<StackEntry, 16> DFSInStack; - DenseMap<Value *, unsigned> Value2Index; - for (ConstraintOrBlock &CB : WorkList) { + for (ConstraintOrBlock &CB : S.WorkList) { // First, pop entries from the stack that are out-of-scope for CB. Remove // the corresponding entry from the constraint system. while (!DFSInStack.empty()) { @@ -409,10 +706,20 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { assert(E.NumIn <= CB.NumIn); if (CB.NumOut <= E.NumOut) break; - LLVM_DEBUG(dbgs() << "Removing " << *E.Condition << " " << E.IsNot - << "\n"); + LLVM_DEBUG({ + dbgs() << "Removing "; + dumpWithNames(Info.getCS(E.IsSigned).getLastConstraint(), + Info.getValue2Index(E.IsSigned)); + dbgs() << "\n"; + }); + + Info.popLastConstraint(E.IsSigned); + // Remove variables in the system that went out of scope. 
+ auto &Mapping = Info.getValue2Index(E.IsSigned); + for (Value *V : E.ValuesToRelease) + Mapping.erase(V); + Info.popLastNVariables(E.IsSigned, E.ValuesToRelease.size()); DFSInStack.pop_back(); - CS.popLastConstraint(); } LLVM_DEBUG({ @@ -427,28 +734,30 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { // For a block, check if any CmpInsts become known based on the current set // of constraints. if (CB.IsBlock) { - for (Instruction &I : *CB.BB) { - auto *Cmp = dyn_cast<CmpInst>(&I); + for (Instruction &I : make_early_inc_range(*CB.BB)) { + if (auto *II = dyn_cast<WithOverflowInst>(&I)) { + tryToSimplifyOverflowMath(II, Info, ToRemove); + continue; + } + auto *Cmp = dyn_cast<ICmpInst>(&I); if (!Cmp) continue; DenseMap<Value *, unsigned> NewIndices; - auto R = getConstraint(Cmp, Value2Index, NewIndices); - if (R.size() != 1) - continue; - - if (R.needsNewIndices(NewIndices)) + auto R = Info.getConstraint(Cmp, NewIndices); + if (R.IsEq || R.empty() || R.needsNewIndices(NewIndices) || + !R.isValid(Info)) continue; - if (CS.isConditionImplied(R.get(0).Coefficients)) { + auto &CSToUse = Info.getCS(R.IsSigned); + if (CSToUse.isConditionImplied(R.Coefficients)) { if (!DebugCounter::shouldExecute(EliminatedCounter)) continue; - LLVM_DEBUG(dbgs() << "Condition " << *Cmp - << " implied by dominating constraints\n"); LLVM_DEBUG({ - for (auto &E : reverse(DFSInStack)) - dbgs() << " C " << *E.Condition << " " << E.IsNot << "\n"; + dbgs() << "Condition " << *Cmp + << " implied by dominating constraints\n"; + dumpWithNames(CSToUse, Info.getValue2Index(R.IsSigned)); }); Cmp->replaceUsesWithIf( ConstantInt::getTrue(F.getParent()->getContext()), [](Use &U) { @@ -460,16 +769,15 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { NumCondsRemoved++; Changed = true; } - if (CS.isConditionImplied( - ConstraintSystem::negate(R.get(0).Coefficients))) { + if (CSToUse.isConditionImplied( + ConstraintSystem::negate(R.Coefficients))) { if 
(!DebugCounter::shouldExecute(EliminatedCounter)) continue; - LLVM_DEBUG(dbgs() << "Condition !" << *Cmp - << " implied by dominating constraints\n"); LLVM_DEBUG({ - for (auto &E : reverse(DFSInStack)) - dbgs() << " C " << *E.Condition << " " << E.IsNot << "\n"; + dbgs() << "Condition !" << *Cmp + << " implied by dominating constraints\n"; + dumpWithNames(CSToUse, Info.getValue2Index(R.IsSigned)); }); Cmp->replaceAllUsesWith( ConstantInt::getFalse(F.getParent()->getContext())); @@ -482,7 +790,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { // Set up a function to restore the predicate at the end of the scope if it // has been negated. Negate the predicate in-place, if required. - auto *CI = dyn_cast<CmpInst>(CB.Condition); + auto *CI = dyn_cast<ICmpInst>(CB.Condition); auto PredicateRestorer = make_scope_exit([CI, &CB]() { if (CB.Not && CI) CI->setPredicate(CI->getInversePredicate()); @@ -496,34 +804,28 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { } } - // Otherwise, add the condition to the system and stack, if we can transform - // it into a constraint. - DenseMap<Value *, unsigned> NewIndices; - auto R = getConstraint(CB.Condition, Value2Index, NewIndices); - if (R.empty()) - continue; - - for (auto &KV : NewIndices) - Value2Index.insert(KV); - - LLVM_DEBUG(dbgs() << "Adding " << *CB.Condition << " " << CB.Not << "\n"); - bool Added = false; - for (auto &C : R.Constraints) { - auto Coeffs = C.Coefficients; - LLVM_DEBUG({ - dbgs() << " constraint: "; - dumpWithNames(C, Value2Index); - }); - Added |= CS.addVariableRowFill(Coeffs); - // If R has been added to the system, queue it for removal once it goes - // out-of-scope. - if (Added) - DFSInStack.emplace_back(CB.NumIn, CB.NumOut, CB.Condition, CB.Not); + ICmpInst::Predicate Pred; + Value *A, *B; + if (match(CB.Condition, m_ICmp(Pred, m_Value(A), m_Value(B)))) { + // Otherwise, add the condition to the system and stack, if we can + // transform it into a constraint. 
+ Info.addFact(Pred, A, B, CB.Not, CB.NumIn, CB.NumOut, DFSInStack); + Info.transferToOtherSystem(Pred, A, B, CB.Not, CB.NumIn, CB.NumOut, + DFSInStack); } } - assert(CS.size() == DFSInStack.size() && +#ifndef NDEBUG + unsigned SignedEntries = + count_if(DFSInStack, [](const StackEntry &E) { return E.IsSigned; }); + assert(Info.getCS(false).size() == DFSInStack.size() - SignedEntries && + "updates to CS and DFSInStack are out of sync"); + assert(Info.getCS(true).size() == SignedEntries && "updates to CS and DFSInStack are out of sync"); +#endif + + for (Instruction *I : ToRemove) + I->eraseFromParent(); return Changed; } diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index a3fd97079b1d..64bd4241f37c 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -41,8 +41,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <cassert> @@ -215,6 +213,53 @@ static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI, return true; } +static Value *getValueOnEdge(LazyValueInfo *LVI, Value *Incoming, + BasicBlock *From, BasicBlock *To, + Instruction *CxtI) { + if (Constant *C = LVI->getConstantOnEdge(Incoming, From, To, CxtI)) + return C; + + // Look if the incoming value is a select with a scalar condition for which + // LVI can tells us the value. In that case replace the incoming value with + // the appropriate value of the select. This often allows us to remove the + // select later. + auto *SI = dyn_cast<SelectInst>(Incoming); + if (!SI) + return nullptr; + + // Once LVI learns to handle vector types, we could also add support + // for vector type constants that are not all zeroes or all ones. 
+ Value *Condition = SI->getCondition(); + if (!Condition->getType()->isVectorTy()) { + if (Constant *C = LVI->getConstantOnEdge(Condition, From, To, CxtI)) { + if (C->isOneValue()) + return SI->getTrueValue(); + if (C->isZeroValue()) + return SI->getFalseValue(); + } + } + + // Look if the select has a constant but LVI tells us that the incoming + // value can never be that constant. In that case replace the incoming + // value with the other value of the select. This often allows us to + // remove the select later. + + // The "false" case + if (auto *C = dyn_cast<Constant>(SI->getFalseValue())) + if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI) == + LazyValueInfo::False) + return SI->getTrueValue(); + + // The "true" case, + // similar to the select "false" case, but try the select "true" value + if (auto *C = dyn_cast<Constant>(SI->getTrueValue())) + if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI) == + LazyValueInfo::False) + return SI->getFalseValue(); + + return nullptr; +} + static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT, const SimplifyQuery &SQ) { bool Changed = false; @@ -224,53 +269,14 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT, Value *Incoming = P->getIncomingValue(i); if (isa<Constant>(Incoming)) continue; - Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P); - - // Look if the incoming value is a select with a scalar condition for which - // LVI can tells us the value. In that case replace the incoming value with - // the appropriate value of the select. This often allows us to remove the - // select later. 
- if (!V) { - SelectInst *SI = dyn_cast<SelectInst>(Incoming); - if (!SI) continue; - - Value *Condition = SI->getCondition(); - if (!Condition->getType()->isVectorTy()) { - if (Constant *C = LVI->getConstantOnEdge( - Condition, P->getIncomingBlock(i), BB, P)) { - if (C->isOneValue()) { - V = SI->getTrueValue(); - } else if (C->isZeroValue()) { - V = SI->getFalseValue(); - } - // Once LVI learns to handle vector types, we could also add support - // for vector type constants that are not all zeroes or all ones. - } - } - - // Look if the select has a constant but LVI tells us that the incoming - // value can never be that constant. In that case replace the incoming - // value with the other value of the select. This often allows us to - // remove the select later. - if (!V) { - Constant *C = dyn_cast<Constant>(SI->getFalseValue()); - if (!C) continue; - - if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, - P->getIncomingBlock(i), BB, P) != - LazyValueInfo::False) - continue; - V = SI->getTrueValue(); - } - - LLVM_DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n'); + Value *V = getValueOnEdge(LVI, Incoming, P->getIncomingBlock(i), BB, P); + if (V) { + P->setIncomingValue(i, V); + Changed = true; } - - P->setIncomingValue(i, V); - Changed = true; } - if (Value *V = SimplifyInstruction(P, SQ)) { + if (Value *V = simplifyInstruction(P, SQ)) { P->replaceAllUsesWith(V); P->eraseFromParent(); Changed = true; @@ -575,7 +581,7 @@ static bool processOverflowIntrinsic(WithOverflowInst *WO, LazyValueInfo *LVI) { StructType *ST = cast<StructType>(WO->getType()); Constant *Struct = ConstantStruct::get(ST, - { UndefValue::get(ST->getElementType(0)), + { PoisonValue::get(ST->getElementType(0)), ConstantInt::getFalse(ST->getElementType(1)) }); Value *NewI = B.CreateInsertValue(Struct, NewOp, 0); WO->replaceAllUsesWith(NewI); @@ -735,8 +741,7 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) { // sdiv/srem is UB if divisor is -1 and divident is 
INT_MIN, so unless we can // prove that such a combination is impossible, we need to bump the bitwidth. if (CRs[1]->contains(APInt::getAllOnes(OrigWidth)) && - CRs[0]->contains( - APInt::getSignedMinValue(MinSignedBits).sextOrSelf(OrigWidth))) + CRs[0]->contains(APInt::getSignedMinValue(MinSignedBits).sext(OrigWidth))) ++MinSignedBits; // Don't shrink below 8 bits wide. @@ -955,7 +960,8 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { ++NumAShrsConverted; auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1), - SDI->getName(), SDI); + "", SDI); + BO->takeName(SDI); BO->setDebugLoc(SDI->getDebugLoc()); BO->setIsExact(SDI->isExact()); SDI->replaceAllUsesWith(BO); @@ -974,8 +980,8 @@ static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) { return false; ++NumSExt; - auto *ZExt = - CastInst::CreateZExtOrBitCast(Base, SDI->getType(), SDI->getName(), SDI); + auto *ZExt = CastInst::CreateZExtOrBitCast(Base, SDI->getType(), "", SDI); + ZExt->takeName(SDI); ZExt->setDebugLoc(SDI->getDebugLoc()); SDI->replaceAllUsesWith(ZExt); SDI->eraseFromParent(); diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 143a78f604fc..5667eefabad5 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -60,30 +60,31 @@ #include "llvm/Transforms/Scalar/DFAJumpThreading.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Verifier.h" #include 
"llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/SSAUpdaterBulk.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> #include <deque> +#ifdef EXPENSIVE_CHECKS +#include "llvm/IR/Verifier.h" +#endif + using namespace llvm; #define DEBUG_TYPE "dfa-jump-threading" @@ -102,6 +103,11 @@ static cl::opt<unsigned> MaxPathLength( cl::desc("Max number of blocks searched to find a threading path"), cl::Hidden, cl::init(20)); +static cl::opt<unsigned> MaxNumPaths( + "dfa-max-num-paths", + cl::desc("Max number of paths enumerated around a switch"), + cl::Hidden, cl::init(200)); + static cl::opt<unsigned> CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), @@ -414,7 +420,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ThreadingPath &TPath) { struct MainSwitch { MainSwitch(SwitchInst *SI, OptimizationRemarkEmitter *ORE) { - if (isPredictable(SI)) { + if (isCandidate(SI)) { Instr = SI; } else { ORE->emit([&]() { @@ -432,83 +438,60 @@ struct MainSwitch { } private: - /// Do a use-def chain traversal. Make sure the value of the switch variable - /// is always a known constant. This means that all conditional jumps based on - /// switch variable can be converted to unconditional jumps. - bool isPredictable(const SwitchInst *SI) { - std::deque<Instruction *> Q; + /// Do a use-def chain traversal starting from the switch condition to see if + /// \p SI is a potential condidate. + /// + /// Also, collect select instructions to unfold. 
+ bool isCandidate(const SwitchInst *SI) { + std::deque<Value *> Q; SmallSet<Value *, 16> SeenValues; SelectInsts.clear(); - Value *FirstDef = SI->getOperand(0); - auto *Inst = dyn_cast<Instruction>(FirstDef); - - // If this is a function argument or another non-instruction, then give up. - // We are interested in loop local variables. - if (!Inst) - return false; - - // Require the first definition to be a PHINode - if (!isa<PHINode>(Inst)) + Value *SICond = SI->getCondition(); + LLVM_DEBUG(dbgs() << "\tSICond: " << *SICond << "\n"); + if (!isa<PHINode>(SICond)) return false; - LLVM_DEBUG(dbgs() << "\tisPredictable() FirstDef: " << *Inst << "\n"); - - Q.push_back(Inst); - SeenValues.insert(FirstDef); + addToQueue(SICond, Q, SeenValues); while (!Q.empty()) { - Instruction *Current = Q.front(); + Value *Current = Q.front(); Q.pop_front(); if (auto *Phi = dyn_cast<PHINode>(Current)) { for (Value *Incoming : Phi->incoming_values()) { - if (!isPredictableValue(Incoming, SeenValues)) - return false; - addInstToQueue(Incoming, Q, SeenValues); + addToQueue(Incoming, Q, SeenValues); } - LLVM_DEBUG(dbgs() << "\tisPredictable() phi: " << *Phi << "\n"); + LLVM_DEBUG(dbgs() << "\tphi: " << *Phi << "\n"); } else if (SelectInst *SelI = dyn_cast<SelectInst>(Current)) { if (!isValidSelectInst(SelI)) return false; - if (!isPredictableValue(SelI->getTrueValue(), SeenValues) || - !isPredictableValue(SelI->getFalseValue(), SeenValues)) { - return false; - } - addInstToQueue(SelI->getTrueValue(), Q, SeenValues); - addInstToQueue(SelI->getFalseValue(), Q, SeenValues); - LLVM_DEBUG(dbgs() << "\tisPredictable() select: " << *SelI << "\n"); + addToQueue(SelI->getTrueValue(), Q, SeenValues); + addToQueue(SelI->getFalseValue(), Q, SeenValues); + LLVM_DEBUG(dbgs() << "\tselect: " << *SelI << "\n"); if (auto *SelIUse = dyn_cast<PHINode>(SelI->user_back())) SelectInsts.push_back(SelectInstToUnfold(SelI, SelIUse)); + } else if (isa<Constant>(Current)) { + LLVM_DEBUG(dbgs() << "\tconst: " << 
*Current << "\n"); + continue; } else { - // If it is neither a phi nor a select, then we give up. - return false; + LLVM_DEBUG(dbgs() << "\tother: " << *Current << "\n"); + // Allow unpredictable values. The hope is that those will be the + // initial switch values that can be ignored (they will hit the + // unthreaded switch) but this assumption will get checked later after + // paths have been enumerated (in function getStateDefMap). + continue; } } return true; } - bool isPredictableValue(Value *InpVal, SmallSet<Value *, 16> &SeenValues) { - if (SeenValues.contains(InpVal)) - return true; - - if (isa<ConstantInt>(InpVal)) - return true; - - // If this is a function argument or another non-instruction, then give up. - if (!isa<Instruction>(InpVal)) - return false; - - return true; - } - - void addInstToQueue(Value *Val, std::deque<Instruction *> &Q, - SmallSet<Value *, 16> &SeenValues) { + void addToQueue(Value *Val, std::deque<Value *> &Q, + SmallSet<Value *, 16> &SeenValues) { if (SeenValues.contains(Val)) return; - if (Instruction *I = dyn_cast<Instruction>(Val)) - Q.push_back(I); + Q.push_back(Val); SeenValues.insert(Val); } @@ -562,7 +545,16 @@ struct AllSwitchPaths { void run() { VisitedBlocks Visited; PathsType LoopPaths = paths(SwitchBlock, Visited, /* PathDepth = */ 1); - StateDefMap StateDef = getStateDefMap(); + StateDefMap StateDef = getStateDefMap(LoopPaths); + + if (StateDef.empty()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable", + Switch) + << "Switch instruction is not predictable."; + }); + return; + } for (PathType Path : LoopPaths) { ThreadingPath TPath; @@ -637,6 +629,9 @@ private: PathType NewPath(Path); NewPath.push_front(BB); Res.push_back(NewPath); + if (Res.size() >= MaxNumPaths) { + return Res; + } } } // This block could now be visited again from a different predecessor. Note @@ -647,14 +642,22 @@ private: } /// Walk the use-def chain and collect all the state-defining instructions. 
- StateDefMap getStateDefMap() const { + /// + /// Return an empty map if unpredictable values encountered inside the basic + /// blocks of \p LoopPaths. + StateDefMap getStateDefMap(const PathsType &LoopPaths) const { StateDefMap Res; + // Basic blocks belonging to any of the loops around the switch statement. + SmallPtrSet<BasicBlock *, 16> LoopBBs; + for (const PathType &Path : LoopPaths) { + for (BasicBlock *BB : Path) + LoopBBs.insert(BB); + } + Value *FirstDef = Switch->getOperand(0); - assert(isa<PHINode>(FirstDef) && "After select unfolding, all state " - "definitions are expected to be phi " - "nodes."); + assert(isa<PHINode>(FirstDef) && "The first definition must be a phi."); SmallVector<PHINode *, 8> Stack; Stack.push_back(dyn_cast<PHINode>(FirstDef)); @@ -666,15 +669,17 @@ private: Res[CurPhi->getParent()] = CurPhi; SeenValues.insert(CurPhi); - for (Value *Incoming : CurPhi->incoming_values()) { + for (BasicBlock *IncomingBB : CurPhi->blocks()) { + Value *Incoming = CurPhi->getIncomingValueForBlock(IncomingBB); + bool IsOutsideLoops = LoopBBs.count(IncomingBB) == 0; if (Incoming == FirstDef || isa<ConstantInt>(Incoming) || - SeenValues.contains(Incoming)) { + SeenValues.contains(Incoming) || IsOutsideLoops) { continue; } - assert(isa<PHINode>(Incoming) && "After select unfolding, all state " - "definitions are expected to be phi " - "nodes."); + // Any unpredictable value inside the loops means we must bail out. 
+ if (!isa<PHINode>(Incoming)) + return StateDefMap(); Stack.push_back(cast<PHINode>(Incoming)); } @@ -823,6 +828,16 @@ private: }); return false; } + + if (!Metrics.NumInsts.isValid()) { + LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " + << "instructions with invalid cost.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst", Switch) + << "Contains instructions with invalid cost."; + }); + return false; + } } unsigned DuplicationCost = 0; @@ -836,7 +851,7 @@ private: // using binary search, hence the LogBase2(). unsigned CondBranches = APInt(32, Switch->getNumSuccessors()).ceilLogBase2(); - DuplicationCost = Metrics.NumInsts / CondBranches; + DuplicationCost = *Metrics.NumInsts.getValue() / CondBranches; } else { // Compared with jump tables, the DFA optimizer removes an indirect branch // on each loop iteration, thus making branch prediction more precise. The @@ -844,7 +859,7 @@ private: // predictor to make a mistake, and the more benefit there is in the DFA // optimizer. Thus, the more branch targets there are, the lower is the // cost of the DFA opt. - DuplicationCost = Metrics.NumInsts / JumpTableSize; + DuplicationCost = *Metrics.NumInsts.getValue() / JumpTableSize; } LLVM_DEBUG(dbgs() << "\nDFA Jump Threading: Cost to jump thread block " @@ -1197,7 +1212,7 @@ private: PhiToRemove.push_back(Phi); } for (PHINode *PN : PhiToRemove) { - PN->replaceAllUsesWith(UndefValue::get(PN->getType())); + PN->replaceAllUsesWith(PoisonValue::get(PN->getType())); PN->eraseFromParent(); } return; @@ -1246,7 +1261,7 @@ private: /// Returns true if IncomingBB is a predecessor of BB. 
bool isPredecessor(BasicBlock *BB, BasicBlock *IncomingBB) { - return llvm::find(predecessors(BB), IncomingBB) != pred_end(BB); + return llvm::is_contained(predecessors(BB), IncomingBB); } AllSwitchPaths *SwitchPaths; @@ -1278,7 +1293,7 @@ bool DFAJumpThreading::run(Function &F) { continue; LLVM_DEBUG(dbgs() << "\nCheck if SwitchInst in BB " << BB.getName() - << " is predictable\n"); + << " is a candidate\n"); MainSwitch Switch(SI, ORE); if (!Switch.getInstr()) diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index ae636e7b61f7..4c42869dbd58 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -38,7 +38,9 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -62,8 +64,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" @@ -75,7 +75,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" @@ -83,7 +82,6 @@ #include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> -#include <cstddef> #include <cstdint> #include <iterator> #include <map> @@ -766,20 +764,27 @@ struct DSEState { // Post-order numbers for each basic block. 
Used to figure out if memory // accesses are executed before another access. DenseMap<BasicBlock *, unsigned> PostOrderNumbers; + // Values that are only used with assumes. Used to refine pointer escape + // analysis. + SmallPtrSet<const Value *, 32> EphValues; /// Keep track of instructions (partly) overlapping with killing MemoryDefs per /// basic block. MapVector<BasicBlock *, InstOverlapIntervalsTy> IOLs; + // Check if there are root nodes that are terminated by UnreachableInst. + // Those roots pessimize post-dominance queries. If there are such roots, + // fall back to CFG scan starting from all non-unreachable roots. + bool AnyUnreachableExit; // Class contains self-reference, make sure it's not copied/moved. DSEState(const DSEState &) = delete; DSEState &operator=(const DSEState &) = delete; DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, - PostDominatorTree &PDT, const TargetLibraryInfo &TLI, - const LoopInfo &LI) - : F(F), AA(AA), EI(DT, LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT), - PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) { + PostDominatorTree &PDT, AssumptionCache &AC, + const TargetLibraryInfo &TLI, const LoopInfo &LI) + : F(F), AA(AA), EI(DT, LI, EphValues), BatchAA(AA, &EI), MSSA(MSSA), + DT(DT), PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) { // Collect blocks with throwing instructions not modeled in MemorySSA and // alloc-like objects. unsigned PO = 0; @@ -805,6 +810,12 @@ struct DSEState { // Collect whether there is any irreducible control flow in the function. 
ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI); + + AnyUnreachableExit = any_of(PDT.roots(), [](const BasicBlock *E) { + return isa<UnreachableInst>(E->getTerminator()); + }); + + CodeMetrics::collectEphemeralValues(&F, &AC, EphValues); } /// Return 'OW_Complete' if a store to the 'KillingLoc' location (by \p @@ -951,7 +962,7 @@ struct DSEState { if (!isInvisibleToCallerOnUnwind(V)) { I.first->second = false; } else if (isNoAliasCall(V)) { - I.first->second = !PointerMayBeCaptured(V, true, false); + I.first->second = !PointerMayBeCaptured(V, true, false, EphValues); } } return I.first->second; @@ -970,7 +981,7 @@ struct DSEState { // with the killing MemoryDef. But we refrain from doing so for now to // limit compile-time and this does not cause any changes to the number // of stores removed on a large test set in practice. - I.first->second = PointerMayBeCaptured(V, false, true); + I.first->second = PointerMayBeCaptured(V, false, true, EphValues); return !I.first->second; } @@ -1003,7 +1014,8 @@ struct DSEState { if (CB->isLifetimeStartOrEnd()) return false; - return CB->use_empty() && CB->willReturn() && CB->doesNotThrow(); + return CB->use_empty() && CB->willReturn() && CB->doesNotThrow() && + !CB->isTerminator(); } return false; @@ -1233,6 +1245,9 @@ struct DSEState { // Reached TOP. if (MSSA.isLiveOnEntryDef(Current)) { LLVM_DEBUG(dbgs() << " ... found LiveOnEntryDef\n"); + if (CanOptimize && Current != KillingDef->getDefiningAccess()) + // The first clobbering def is... none. + KillingDef->setOptimized(Current); return None; } @@ -1309,7 +1324,6 @@ struct DSEState { // memory location and not located in different loops. if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) { LLVM_DEBUG(dbgs() << " ... 
not guaranteed loop independent\n"); - WalkerStepLimit -= 1; CanOptimize = false; continue; } @@ -1508,54 +1522,56 @@ struct DSEState { CommonPred = PDT.findNearestCommonDominator(CommonPred, BB); } - // If CommonPred is in the set of killing blocks, just check if it - // post-dominates MaybeDeadAccess. - if (KillingBlocks.count(CommonPred)) { - if (PDT.dominates(CommonPred, MaybeDeadAccess->getBlock())) - return {MaybeDeadAccess}; - return None; - } - // If the common post-dominator does not post-dominate MaybeDeadAccess, // there is a path from MaybeDeadAccess to an exit not going through a // killing block. - if (PDT.dominates(CommonPred, MaybeDeadAccess->getBlock())) { - SetVector<BasicBlock *> WorkList; + if (!PDT.dominates(CommonPred, MaybeDeadAccess->getBlock())) { + if (!AnyUnreachableExit) + return None; - // If CommonPred is null, there are multiple exits from the function. - // They all have to be added to the worklist. - if (CommonPred) - WorkList.insert(CommonPred); - else - for (BasicBlock *R : PDT.roots()) + // Fall back to CFG scan starting at all non-unreachable roots if not + // all paths to the exit go through CommonPred. + CommonPred = nullptr; + } + + // If CommonPred itself is in the set of killing blocks, we're done. + if (KillingBlocks.count(CommonPred)) + return {MaybeDeadAccess}; + + SetVector<BasicBlock *> WorkList; + // If CommonPred is null, there are multiple exits from the function. + // They all have to be added to the worklist. + if (CommonPred) + WorkList.insert(CommonPred); + else + for (BasicBlock *R : PDT.roots()) { + if (!isa<UnreachableInst>(R->getTerminator())) WorkList.insert(R); + } - NumCFGTries++; - // Check if all paths starting from an exit node go through one of the - // killing blocks before reaching MaybeDeadAccess. 
- for (unsigned I = 0; I < WorkList.size(); I++) { - NumCFGChecks++; - BasicBlock *Current = WorkList[I]; - if (KillingBlocks.count(Current)) - continue; - if (Current == MaybeDeadAccess->getBlock()) - return None; + NumCFGTries++; + // Check if all paths starting from an exit node go through one of the + // killing blocks before reaching MaybeDeadAccess. + for (unsigned I = 0; I < WorkList.size(); I++) { + NumCFGChecks++; + BasicBlock *Current = WorkList[I]; + if (KillingBlocks.count(Current)) + continue; + if (Current == MaybeDeadAccess->getBlock()) + return None; - // MaybeDeadAccess is reachable from the entry, so we don't have to - // explore unreachable blocks further. - if (!DT.isReachableFromEntry(Current)) - continue; + // MaybeDeadAccess is reachable from the entry, so we don't have to + // explore unreachable blocks further. + if (!DT.isReachableFromEntry(Current)) + continue; - for (BasicBlock *Pred : predecessors(Current)) - WorkList.insert(Pred); + for (BasicBlock *Pred : predecessors(Current)) + WorkList.insert(Pred); - if (WorkList.size() >= MemorySSAPathCheckLimit) - return None; - } - NumCFGSuccess++; - return {MaybeDeadAccess}; + if (WorkList.size() >= MemorySSAPathCheckLimit) + return None; } - return None; + NumCFGSuccess++; } // No aliasing MemoryUses of MaybeDeadAccess found, MaybeDeadAccess is @@ -1780,10 +1796,9 @@ struct DSEState { if (!isRemovable(DefI)) return false; - if (StoredConstant && isAllocationFn(DefUO, &TLI)) { - auto *CB = cast<CallBase>(DefUO); - auto *InitC = getInitialValueOfAllocation(CB, &TLI, - StoredConstant->getType()); + if (StoredConstant) { + Constant *InitC = + getInitialValueOfAllocation(DefUO, &TLI, StoredConstant->getType()); // If the clobbering access is LiveOnEntry, no instructions between them // can modify the memory location. 
if (InitC && InitC == StoredConstant) @@ -1921,11 +1936,13 @@ struct DSEState { static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT, PostDominatorTree &PDT, + AssumptionCache &AC, const TargetLibraryInfo &TLI, const LoopInfo &LI) { bool MadeChange = false; - DSEState State(F, AA, MSSA, DT, PDT, TLI, LI); + MSSA.ensureOptimizedUses(); + DSEState State(F, AA, MSSA, DT, PDT, AC, TLI, LI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { MemoryDef *KillingDef = State.MemDefs[I]; @@ -2105,9 +2122,10 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) { DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA(); PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); + AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); LoopInfo &LI = AM.getResult<LoopAnalysis>(F); - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, AC, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2147,9 +2165,11 @@ public: MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA(); PostDominatorTree &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); + AssumptionCache &AC = + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI); + bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, AC, TLI, LI); #ifdef LLVM_ENABLE_STATS if (AreStatisticsEnabled()) @@ -2173,6 +2193,7 @@ public: AU.addPreserved<MemorySSAWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); } }; @@ -2190,6 +2211,7 @@ INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) 
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false, false) diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 59b934c16c8a..cf2824954122 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopedHashTable.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" @@ -30,19 +29,16 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" -#include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -55,7 +51,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" -#include "llvm/Transforms/Utils/GuardUtils.h" #include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <deque> @@ -781,6 +776,21 @@ private: return getLoadStorePointerOperand(Inst); } + Type *getValueType() const { + // TODO: handle target-specific intrinsics. 
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { + switch (II->getIntrinsicID()) { + case Intrinsic::masked_load: + return II->getType(); + case Intrinsic::masked_store: + return II->getArgOperand(0)->getType(); + default: + return nullptr; + } + } + return getLoadStoreType(Inst); + } + bool mayReadFromMemory() const { if (IntrID != 0) return Info.ReadMem; @@ -1162,6 +1172,9 @@ bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier, "Violated invariant"); if (Earlier.getPointerOperand() != Later.getPointerOperand()) return false; + if (!Earlier.getValueType() || !Later.getValueType() || + Earlier.getValueType() != Later.getValueType()) + return false; if (Earlier.getMatchingId() != Later.getMatchingId()) return false; // At the moment, we don't remove ordered stores, but do remove @@ -1334,7 +1347,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. - if (Value *V = SimplifyInstruction(&Inst, SQ)) { + if (Value *V = simplifyInstruction(&Inst, SQ)) { LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << Inst << " to: " << *V << '\n'); if (!DebugCounter::shouldExecute(CSECounter)) { diff --git a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index 44017b555769..ad2041cd4253 100644 --- a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -11,8 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp index a98bb8358aef..56f2a3b3004d 100644 --- a/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -11,24 
+11,22 @@ // //===----------------------------------------------------------------------===// -#include "llvm/InitializePasses.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar/Float2Int.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include <deque> -#include <functional> // For std::function #define DEBUG_TYPE "float2int" @@ -236,116 +234,111 @@ void Float2IntPass::walkBackwards() { } } -// Walk forwards down the list of seen instructions, so we visit defs before -// uses. -void Float2IntPass::walkForwards() { - for (auto &It : reverse(SeenInsts)) { - if (It.second != unknownRange()) - continue; +// Calculate result range from operand ranges. +// Return None if the range cannot be calculated yet. +Optional<ConstantRange> Float2IntPass::calcRange(Instruction *I) { + SmallVector<ConstantRange, 4> OpRanges; + for (Value *O : I->operands()) { + if (Instruction *OI = dyn_cast<Instruction>(O)) { + auto OpIt = SeenInsts.find(OI); + assert(OpIt != SeenInsts.end() && "def not seen before use!"); + if (OpIt->second == unknownRange()) + return None; // Wait until operand range has been calculated. + OpRanges.push_back(OpIt->second); + } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) { + // Work out if the floating point number can be losslessly represented + // as an integer. + // APFloat::convertToInteger(&Exact) purports to do what we want, but + // the exactness can be too precise. For example, negative zero can + // never be exactly converted to an integer. 
+ // + // Instead, we ask APFloat to round itself to an integral value - this + // preserves sign-of-zero - then compare the result with the original. + // + const APFloat &F = CF->getValueAPF(); - Instruction *I = It.first; - std::function<ConstantRange(ArrayRef<ConstantRange>)> Op; - switch (I->getOpcode()) { - // FIXME: Handle select and phi nodes. - default: - case Instruction::UIToFP: - case Instruction::SIToFP: - llvm_unreachable("Should have been handled in walkForwards!"); + // First, weed out obviously incorrect values. Non-finite numbers + // can't be represented and neither can negative zero, unless + // we're in fast math mode. + if (!F.isFinite() || + (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) && + !I->hasNoSignedZeros())) + return badRange(); - case Instruction::FNeg: - Op = [](ArrayRef<ConstantRange> Ops) { - assert(Ops.size() == 1 && "FNeg is a unary operator!"); - unsigned Size = Ops[0].getBitWidth(); - auto Zero = ConstantRange(APInt::getZero(Size)); - return Zero.sub(Ops[0]); - }; - break; + APFloat NewF = F; + auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven); + if (Res != APFloat::opOK || NewF != F) + return badRange(); - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::FMul: - Op = [I](ArrayRef<ConstantRange> Ops) { - assert(Ops.size() == 2 && "its a binary operator!"); - auto BinOp = (Instruction::BinaryOps) I->getOpcode(); - return Ops[0].binaryOp(BinOp, Ops[1]); - }; - break; + // OK, it's representable. Now get it. + APSInt Int(MaxIntegerBW+1, false); + bool Exact; + CF->getValueAPF().convertToInteger(Int, + APFloat::rmNearestTiesToEven, + &Exact); + OpRanges.push_back(ConstantRange(Int)); + } else { + llvm_unreachable("Should have already marked this as badRange!"); + } + } - // - // Root-only instructions - we'll only see these if they're the - // first node in a walk. 
- // - case Instruction::FPToUI: - case Instruction::FPToSI: - Op = [I](ArrayRef<ConstantRange> Ops) { - assert(Ops.size() == 1 && "FPTo[US]I is a unary operator!"); - // Note: We're ignoring the casts output size here as that's what the - // caller expects. - auto CastOp = (Instruction::CastOps)I->getOpcode(); - return Ops[0].castOp(CastOp, MaxIntegerBW+1); - }; - break; + switch (I->getOpcode()) { + // FIXME: Handle select and phi nodes. + default: + case Instruction::UIToFP: + case Instruction::SIToFP: + llvm_unreachable("Should have been handled in walkForwards!"); - case Instruction::FCmp: - Op = [](ArrayRef<ConstantRange> Ops) { - assert(Ops.size() == 2 && "FCmp is a binary operator!"); - return Ops[0].unionWith(Ops[1]); - }; - break; - } + case Instruction::FNeg: { + assert(OpRanges.size() == 1 && "FNeg is a unary operator!"); + unsigned Size = OpRanges[0].getBitWidth(); + auto Zero = ConstantRange(APInt::getZero(Size)); + return Zero.sub(OpRanges[0]); + } - bool Abort = false; - SmallVector<ConstantRange,4> OpRanges; - for (Value *O : I->operands()) { - if (Instruction *OI = dyn_cast<Instruction>(O)) { - assert(SeenInsts.find(OI) != SeenInsts.end() && - "def not seen before use!"); - OpRanges.push_back(SeenInsts.find(OI)->second); - } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) { - // Work out if the floating point number can be losslessly represented - // as an integer. - // APFloat::convertToInteger(&Exact) purports to do what we want, but - // the exactness can be too precise. For example, negative zero can - // never be exactly converted to an integer. - // - // Instead, we ask APFloat to round itself to an integral value - this - // preserves sign-of-zero - then compare the result with the original. 
- // - const APFloat &F = CF->getValueAPF(); + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: { + assert(OpRanges.size() == 2 && "its a binary operator!"); + auto BinOp = (Instruction::BinaryOps) I->getOpcode(); + return OpRanges[0].binaryOp(BinOp, OpRanges[1]); + } - // First, weed out obviously incorrect values. Non-finite numbers - // can't be represented and neither can negative zero, unless - // we're in fast math mode. - if (!F.isFinite() || - (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) && - !I->hasNoSignedZeros())) { - seen(I, badRange()); - Abort = true; - break; - } + // + // Root-only instructions - we'll only see these if they're the + // first node in a walk. + // + case Instruction::FPToUI: + case Instruction::FPToSI: { + assert(OpRanges.size() == 1 && "FPTo[US]I is a unary operator!"); + // Note: We're ignoring the casts output size here as that's what the + // caller expects. + auto CastOp = (Instruction::CastOps)I->getOpcode(); + return OpRanges[0].castOp(CastOp, MaxIntegerBW+1); + } - APFloat NewF = F; - auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven); - if (Res != APFloat::opOK || NewF != F) { - seen(I, badRange()); - Abort = true; - break; - } - // OK, it's representable. Now get it. - APSInt Int(MaxIntegerBW+1, false); - bool Exact; - CF->getValueAPF().convertToInteger(Int, - APFloat::rmNearestTiesToEven, - &Exact); - OpRanges.push_back(ConstantRange(Int)); - } else { - llvm_unreachable("Should have already marked this as badRange!"); - } - } + case Instruction::FCmp: + assert(OpRanges.size() == 2 && "FCmp is a binary operator!"); + return OpRanges[0].unionWith(OpRanges[1]); + } +} + +// Walk forwards down the list of seen instructions, so we visit defs before +// uses. 
+void Float2IntPass::walkForwards() { + std::deque<Instruction *> Worklist; + for (const auto &Pair : SeenInsts) + if (Pair.second == unknownRange()) + Worklist.push_back(Pair.first); + + while (!Worklist.empty()) { + Instruction *I = Worklist.back(); + Worklist.pop_back(); - // Reduce the operands' ranges to a single range and return. - if (!Abort) - seen(I, Op(OpRanges)); + if (Optional<ConstantRange> Range = calcRange(I)) + seen(I, *Range); + else + Worklist.push_front(I); // Reprocess later. } } diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 398c93e8758c..783301fe589e 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -19,7 +19,6 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -32,6 +31,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InstructionPrecedenceTracking.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -42,12 +42,10 @@ #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Config/llvm-config.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -55,11 +53,9 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" 
#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" @@ -72,7 +68,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -112,16 +107,16 @@ static cl::opt<bool> GVNEnableLoadInLoopPRE("enable-load-in-loop-pre", cl::init(true)); static cl::opt<bool> GVNEnableSplitBackedgeInLoadPRE("enable-split-backedge-in-load-pre", - cl::init(true)); + cl::init(false)); static cl::opt<bool> GVNEnableMemDep("enable-gvn-memdep", cl::init(true)); static cl::opt<uint32_t> MaxNumDeps( - "gvn-max-num-deps", cl::Hidden, cl::init(100), cl::ZeroOrMore, + "gvn-max-num-deps", cl::Hidden, cl::init(100), cl::desc("Max number of dependences to attempt Load PRE (default = 100)")); // This is based on IsValueFullyAvailableInBlockNumSpeculationsMax stat. static cl::opt<uint32_t> MaxBBSpeculations( - "gvn-max-block-speculations", cl::Hidden, cl::init(600), cl::ZeroOrMore, + "gvn-max-block-speculations", cl::Hidden, cl::init(600), cl::desc("Max number of blocks we're willing to speculate on (and recurse " "into) when deducing if a value is fully available or not in GVN " "(default = 600)")); @@ -129,6 +124,8 @@ static cl::opt<uint32_t> MaxBBSpeculations( struct llvm::GVNPass::Expression { uint32_t opcode; bool commutative = false; + // The type is not necessarily the result type of the expression, it may be + // any additional type needed to disambiguate the expression. Type *type = nullptr; SmallVector<uint32_t, 4> varargs; @@ -178,70 +175,88 @@ template <> struct DenseMapInfo<GVNPass::Expression> { /// implicitly associated with a rematerialization point which is the /// location of the instruction from which it was formed. 
struct llvm::gvn::AvailableValue { - enum ValType { + enum class ValType { SimpleVal, // A simple offsetted value that is accessed. LoadVal, // A value produced by a load. MemIntrin, // A memory intrinsic which is loaded from. - UndefVal // A UndefValue representing a value from dead block (which + UndefVal, // A UndefValue representing a value from dead block (which // is not yet physically removed from the CFG). + SelectVal, // A pointer select which is loaded from and for which the load + // can be replace by a value select. }; - /// V - The value that is live out of the block. - PointerIntPair<Value *, 2, ValType> Val; + /// Val - The value that is live out of the block. + Value *Val; + /// Kind of the live-out value. + ValType Kind; /// Offset - The byte offset in Val that is interesting for the load query. unsigned Offset = 0; static AvailableValue get(Value *V, unsigned Offset = 0) { AvailableValue Res; - Res.Val.setPointer(V); - Res.Val.setInt(SimpleVal); + Res.Val = V; + Res.Kind = ValType::SimpleVal; Res.Offset = Offset; return Res; } static AvailableValue getMI(MemIntrinsic *MI, unsigned Offset = 0) { AvailableValue Res; - Res.Val.setPointer(MI); - Res.Val.setInt(MemIntrin); + Res.Val = MI; + Res.Kind = ValType::MemIntrin; Res.Offset = Offset; return Res; } static AvailableValue getLoad(LoadInst *Load, unsigned Offset = 0) { AvailableValue Res; - Res.Val.setPointer(Load); - Res.Val.setInt(LoadVal); + Res.Val = Load; + Res.Kind = ValType::LoadVal; Res.Offset = Offset; return Res; } static AvailableValue getUndef() { AvailableValue Res; - Res.Val.setPointer(nullptr); - Res.Val.setInt(UndefVal); + Res.Val = nullptr; + Res.Kind = ValType::UndefVal; Res.Offset = 0; return Res; } - bool isSimpleValue() const { return Val.getInt() == SimpleVal; } - bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } - bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } - bool isUndefValue() const { return Val.getInt() == UndefVal; } + static 
AvailableValue getSelect(SelectInst *Sel) { + AvailableValue Res; + Res.Val = Sel; + Res.Kind = ValType::SelectVal; + Res.Offset = 0; + return Res; + } + + bool isSimpleValue() const { return Kind == ValType::SimpleVal; } + bool isCoercedLoadValue() const { return Kind == ValType::LoadVal; } + bool isMemIntrinValue() const { return Kind == ValType::MemIntrin; } + bool isUndefValue() const { return Kind == ValType::UndefVal; } + bool isSelectValue() const { return Kind == ValType::SelectVal; } Value *getSimpleValue() const { assert(isSimpleValue() && "Wrong accessor"); - return Val.getPointer(); + return Val; } LoadInst *getCoercedLoadValue() const { assert(isCoercedLoadValue() && "Wrong accessor"); - return cast<LoadInst>(Val.getPointer()); + return cast<LoadInst>(Val); } MemIntrinsic *getMemIntrinValue() const { assert(isMemIntrinValue() && "Wrong accessor"); - return cast<MemIntrinsic>(Val.getPointer()); + return cast<MemIntrinsic>(Val); + } + + SelectInst *getSelectValue() const { + assert(isSelectValue() && "Wrong accessor"); + return cast<SelectInst>(Val); } /// Emit code at the specified insertion point to adjust the value defined @@ -275,6 +290,10 @@ struct llvm::gvn::AvailableValueInBlock { return get(BB, AvailableValue::getUndef()); } + static AvailableValueInBlock getSelect(BasicBlock *BB, SelectInst *Sel) { + return get(BB, AvailableValue::getSelect(Sel)); + } + /// Emit code at the end of this block to adjust the value defined here to /// the specified type. This handles various coercion cases. 
Value *MaterializeAdjustedValue(LoadInst *Load, GVNPass &gvn) const { @@ -379,6 +398,39 @@ GVNPass::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) { return e; } +GVNPass::Expression GVNPass::ValueTable::createGEPExpr(GetElementPtrInst *GEP) { + Expression E; + Type *PtrTy = GEP->getType()->getScalarType(); + const DataLayout &DL = GEP->getModule()->getDataLayout(); + unsigned BitWidth = DL.getIndexTypeSizeInBits(PtrTy); + MapVector<Value *, APInt> VariableOffsets; + APInt ConstantOffset(BitWidth, 0); + if (PtrTy->isOpaquePointerTy() && + GEP->collectOffset(DL, BitWidth, VariableOffsets, ConstantOffset)) { + // For opaque pointers, convert into offset representation, to recognize + // equivalent address calculations that use different type encoding. + LLVMContext &Context = GEP->getContext(); + E.opcode = GEP->getOpcode(); + E.type = nullptr; + E.varargs.push_back(lookupOrAdd(GEP->getPointerOperand())); + for (const auto &Pair : VariableOffsets) { + E.varargs.push_back(lookupOrAdd(Pair.first)); + E.varargs.push_back(lookupOrAdd(ConstantInt::get(Context, Pair.second))); + } + if (!ConstantOffset.isZero()) + E.varargs.push_back( + lookupOrAdd(ConstantInt::get(Context, ConstantOffset))); + } else { + // If converting to offset representation fails (for typed pointers and + // scalable vectors), fall back to type-based implementation: + E.opcode = GEP->getOpcode(); + E.type = GEP->getSourceElementType(); + for (Use &Op : GEP->operands()) + E.varargs.push_back(lookupOrAdd(Op)); + } + return E; +} + //===----------------------------------------------------------------------===// // ValueTable External Functions //===----------------------------------------------------------------------===// @@ -562,9 +614,11 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) { case Instruction::InsertElement: case Instruction::ShuffleVector: case Instruction::InsertValue: - case Instruction::GetElementPtr: exp = createExpr(I); break; + case Instruction::GetElementPtr: + exp 
= createGEPExpr(cast<GetElementPtrInst>(I)); + break; case Instruction::ExtractValue: exp = createExtractvalueExpr(cast<ExtractValueInst>(I)); break; @@ -639,24 +693,24 @@ void GVNPass::ValueTable::verifyRemoved(const Value *V) const { //===----------------------------------------------------------------------===// bool GVNPass::isPREEnabled() const { - return Options.AllowPRE.getValueOr(GVNEnablePRE); + return Options.AllowPRE.value_or(GVNEnablePRE); } bool GVNPass::isLoadPREEnabled() const { - return Options.AllowLoadPRE.getValueOr(GVNEnableLoadPRE); + return Options.AllowLoadPRE.value_or(GVNEnableLoadPRE); } bool GVNPass::isLoadInLoopPREEnabled() const { - return Options.AllowLoadInLoopPRE.getValueOr(GVNEnableLoadInLoopPRE); + return Options.AllowLoadInLoopPRE.value_or(GVNEnableLoadInLoopPRE); } bool GVNPass::isLoadPRESplitBackedgeEnabled() const { - return Options.AllowLoadPRESplitBackedge.getValueOr( + return Options.AllowLoadPRESplitBackedge.value_or( GVNEnableSplitBackedgeInLoadPRE); } bool GVNPass::isMemDepEnabled() const { - return Options.AllowMemDep.getValueOr(GVNEnableMemDep); + return Options.AllowMemDep.value_or(GVNEnableMemDep); } PreservedAnalyses GVNPass::run(Function &F, FunctionAnalysisManager &AM) { @@ -897,6 +951,17 @@ ConstructSSAForLoadSet(LoadInst *Load, return SSAUpdate.GetValueInMiddleOfBlock(Load->getParent()); } +static LoadInst *findDominatingLoad(Value *Ptr, Type *LoadTy, SelectInst *Sel, + DominatorTree &DT) { + for (Value *U : Ptr->users()) { + auto *LI = dyn_cast<LoadInst>(U); + if (LI && LI->getType() == LoadTy && LI->getParent() == Sel->getParent() && + DT.dominates(LI, Sel)) + return LI; + } + return nullptr; +} + Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load, Instruction *InsertPt, GVNPass &gvn) const { @@ -937,6 +1002,17 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load, << " " << *getMemIntrinValue() << '\n' << *Res << '\n' << "\n\n\n"); + } else if (isSelectValue()) { + // Introduce a new 
value select for a load from an eligible pointer select. + SelectInst *Sel = getSelectValue(); + LoadInst *L1 = findDominatingLoad(Sel->getOperand(1), LoadTy, Sel, + gvn.getDominatorTree()); + LoadInst *L2 = findDominatingLoad(Sel->getOperand(2), LoadTy, Sel, + gvn.getDominatorTree()); + assert(L1 && L2 && + "must be able to obtain dominating loads for both value operands of " + "the select"); + Res = SelectInst::Create(Sel->getCondition(), L1, L2, "", Sel); } else { llvm_unreachable("Should not materialize value from dead block"); } @@ -1023,8 +1099,54 @@ static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo, ORE->emit(R); } +/// Check if a load from pointer-select \p Address in \p DepBB can be converted +/// to a value select. The following conditions need to be satisfied: +/// 1. The pointer select (\p Address) must be defined in \p DepBB. +/// 2. Both value operands of the pointer select must be loaded in the same +/// basic block, before the pointer select. +/// 3. There must be no instructions between the found loads and \p End that may +/// clobber the loads. +static Optional<AvailableValue> +tryToConvertLoadOfPtrSelect(BasicBlock *DepBB, BasicBlock::iterator End, + Value *Address, Type *LoadTy, DominatorTree &DT, + AAResults *AA) { + + auto *Sel = dyn_cast_or_null<SelectInst>(Address); + if (!Sel || DepBB != Sel->getParent()) + return None; + + LoadInst *L1 = findDominatingLoad(Sel->getOperand(1), LoadTy, Sel, DT); + LoadInst *L2 = findDominatingLoad(Sel->getOperand(2), LoadTy, Sel, DT); + if (!L1 || !L2) + return None; + + // Ensure there are no accesses that may modify the locations referenced by + // either L1 or L2 between L1, L2 and the specified End iterator. + Instruction *EarlierLoad = L1->comesBefore(L2) ? 
L1 : L2; + MemoryLocation L1Loc = MemoryLocation::get(L1); + MemoryLocation L2Loc = MemoryLocation::get(L2); + if (any_of(make_range(EarlierLoad->getIterator(), End), [&](Instruction &I) { + return isModSet(AA->getModRefInfo(&I, L1Loc)) || + isModSet(AA->getModRefInfo(&I, L2Loc)); + })) + return None; + + return AvailableValue::getSelect(Sel); +} + bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, Value *Address, AvailableValue &Res) { + if (!DepInfo.isDef() && !DepInfo.isClobber()) { + assert(isa<SelectInst>(Address)); + if (auto R = tryToConvertLoadOfPtrSelect( + Load->getParent(), Load->getIterator(), Address, Load->getType(), + getDominatorTree(), getAliasAnalysis())) { + Res = *R; + return true; + } + return false; + } + assert((DepInfo.isDef() || DepInfo.isClobber()) && "expected a local dependence"); assert(Load->isUnordered() && "rules below are incorrect for ordered access"); @@ -1066,9 +1188,7 @@ bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, canCoerceMustAliasedValueToLoad(DepLoad, LoadType, DL)) { const auto ClobberOff = MD->getClobberOffset(DepLoad); // GVN has no deal with a negative offset. - Offset = (ClobberOff == None || ClobberOff.getValue() < 0) - ? -1 - : ClobberOff.getValue(); + Offset = (ClobberOff == None || *ClobberOff < 0) ? -1 : *ClobberOff; } if (Offset == -1) Offset = @@ -1092,6 +1212,7 @@ bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, } } } + // Nothing known about this clobber, have to be conservative LLVM_DEBUG( // fast print dep, using operator<< on instruction is too slow. 
@@ -1111,12 +1232,11 @@ bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, return true; } - if (isAllocationFn(DepInst, TLI)) - if (auto *InitVal = getInitialValueOfAllocation(cast<CallBase>(DepInst), - TLI, Load->getType())) { - Res = AvailableValue::get(InitVal); - return true; - } + if (Constant *InitVal = + getInitialValueOfAllocation(DepInst, TLI, Load->getType())) { + Res = AvailableValue::get(InitVal); + return true; + } if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { // Reject loads and stores that are to the same address but are of @@ -1176,16 +1296,23 @@ void GVNPass::AnalyzeLoadAvailability(LoadInst *Load, LoadDepVect &Deps, continue; } - if (!DepInfo.isDef() && !DepInfo.isClobber()) { - UnavailableBlocks.push_back(DepBB); - continue; - } - // The address being loaded in this non-local block may not be the same as // the pointer operand of the load if PHI translation occurs. Make sure // to consider the right address. Value *Address = Deps[i].getAddress(); + if (!DepInfo.isDef() && !DepInfo.isClobber()) { + if (auto R = tryToConvertLoadOfPtrSelect( + DepBB, DepBB->end(), Address, Load->getType(), getDominatorTree(), + getAliasAnalysis())) { + ValuesPerBlock.push_back( + AvailableValueInBlock::get(DepBB, std::move(*R))); + continue; + } + UnavailableBlocks.push_back(DepBB); + continue; + } + AvailableValue AV; if (AnalyzeLoadAvailability(Load, DepInfo, Address, AV)) { // subtlety: because we know this was a non-local dependency, we know @@ -1923,8 +2050,9 @@ bool GVNPass::processLoad(LoadInst *L) { if (Dep.isNonLocal()) return processNonLocalLoad(L); + Value *Address = L->getPointerOperand(); // Only handle the local case below - if (!Dep.isDef() && !Dep.isClobber()) { + if (!Dep.isDef() && !Dep.isClobber() && !isa<SelectInst>(Address)) { // This might be a NonFuncLocal or an Unknown LLVM_DEBUG( // fast print dep, using operator<< on instruction is too slow. 
@@ -1934,7 +2062,7 @@ bool GVNPass::processLoad(LoadInst *L) { } AvailableValue AV; - if (AnalyzeLoadAvailability(L, Dep, L->getPointerOperand(), AV)) { + if (AnalyzeLoadAvailability(L, Dep, Address, AV)) { Value *AvailableValue = AV.MaterializeAdjustedValue(L, L, *this); // Replace the load! @@ -2324,7 +2452,7 @@ bool GVNPass::processInstruction(Instruction *I) { // example if it determines that %y is equal to %x then the instruction // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. const DataLayout &DL = I->getModule()->getDataLayout(); - if (Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC})) { + if (Value *V = simplifyInstruction(I, {DL, TLI, DT, AC})) { bool Changed = false; if (!I->use_empty()) { // Simplification can cause a special instruction to become not special. @@ -2491,6 +2619,7 @@ bool GVNPass::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, unsigned Iteration = 0; while (ShouldContinue) { LLVM_DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n"); + (void) Iteration; ShouldContinue = iterateOnFunction(F); Changed |= ShouldContinue; ++Iteration; diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp index fdc3afd9348a..6cdc671ddb64 100644 --- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -54,11 +54,9 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Use.h" @@ -126,7 +124,7 @@ using HoistingPointInfo = std::pair<BasicBlock *, SmallVecInsn>; using HoistingPointList = SmallVector<HoistingPointInfo, 4>; // A map from a pair of VNs to all the instructions with those VNs. 
-using VNType = std::pair<unsigned, unsigned>; +using VNType = std::pair<unsigned, uintptr_t>; using VNtoInsns = DenseMap<VNType, SmallVector<Instruction *, 4>>; @@ -161,7 +159,7 @@ using InValuesType = // An invalid value number Used when inserting a single value number into // VNtoInsns. -enum : unsigned { InvalidVN = ~2U }; +enum : uintptr_t { InvalidVN = ~(uintptr_t)2 }; // Records all scalar instructions candidate for code hoisting. class InsnInfo { @@ -187,7 +185,9 @@ public: void insert(LoadInst *Load, GVNPass::ValueTable &VN) { if (Load->isSimple()) { unsigned V = VN.lookupOrAdd(Load->getPointerOperand()); - VNtoLoads[{V, InvalidVN}].push_back(Load); + // With opaque pointers we may have loads from the same pointer with + // different result types, which should be disambiguated. + VNtoLoads[{V, (uintptr_t)Load->getType()}].push_back(Load); } } @@ -261,7 +261,9 @@ public: GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA, MemoryDependenceResults *MD, MemorySSA *MSSA) : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA), - MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {} + MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) { + MSSA->ensureOptimizedUses(); + } bool run(Function &F); @@ -1147,6 +1149,8 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) { DFSNumber[Repl] = DFSNumber[Last]++; } + // Drop debug location as per debug info update guide. 
+ Repl->dropLocation(); NR += removeAndReplace(InstructionsToHoist, Repl, DestBB, MoveAccess); if (isa<LoadInst>(Repl)) diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index e612a82fc89a..720b8e71fd56 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -35,7 +35,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/None.h" @@ -45,7 +44,6 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -383,6 +381,8 @@ public: } }; +using BasicBlocksSet = SmallPtrSet<const BasicBlock *, 32>; + class ValueTable { DenseMap<Value *, uint32_t> ValueNumbering; DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering; @@ -390,6 +390,7 @@ class ValueTable { BumpPtrAllocator Allocator; ArrayRecycler<Value *> Recycler; uint32_t nextValueNumber = 1; + BasicBlocksSet ReachableBBs; /// Create an expression for I based on its opcode and its uses. If I /// touches or reads memory, the expression is also based upon its memory @@ -421,6 +422,11 @@ class ValueTable { public: ValueTable() = default; + /// Set basic blocks reachable from entry block. + void setReachableBBs(const BasicBlocksSet &ReachableBBs) { + this->ReachableBBs = ReachableBBs; + } + /// Returns the value number for the specified value, assigning /// it a new number if it did not have one before. 
uint32_t lookupOrAdd(Value *V) { @@ -434,6 +440,9 @@ public: } Instruction *I = cast<Instruction>(V); + if (!ReachableBBs.contains(I->getParent())) + return ~0U; + InstructionUseExpr *exp = nullptr; switch (I->getOpcode()) { case Instruction::Load: @@ -570,6 +579,7 @@ public: unsigned NumSunk = 0; ReversePostOrderTraversal<Function*> RPOT(&F); + VN.setReachableBBs(BasicBlocksSet(RPOT.begin(), RPOT.end())); for (auto *N : RPOT) NumSunk += sinkBB(N); @@ -648,12 +658,7 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking( VNums[N]++; } unsigned VNumToSink = - std::max_element(VNums.begin(), VNums.end(), - [](const std::pair<uint32_t, unsigned> &I, - const std::pair<uint32_t, unsigned> &J) { - return I.second < J.second; - }) - ->first; + std::max_element(VNums.begin(), VNums.end(), llvm::less_second())->first; if (VNums[VNumToSink] == 1) // Can't sink anything! @@ -776,12 +781,9 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) { unsigned NumOrigPreds = Preds.size(); // We can only sink instructions through unconditional branches. 
- for (auto I = Preds.begin(); I != Preds.end();) { - if ((*I)->getTerminator()->getNumSuccessors() != 1) - I = Preds.erase(I); - else - ++I; - } + llvm::erase_if(Preds, [](BasicBlock *BB) { + return BB->getTerminator()->getNumSuccessors() != 1; + }); LockstepReverseIterator LRI(Preds); SmallVector<SinkingInstructionCandidate, 4> Candidates; diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp index 82b81003ef21..af6062d142f0 100644 --- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -42,7 +42,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -496,6 +495,8 @@ void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) const { makeAvailableAt(Op, Loc); Inst->moveBefore(Loc); + // If we moved instruction before guard we must clean poison generating flags. 
+ Inst->dropPoisonGeneratingFlags(); } bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1, diff --git a/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp b/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp index e2022aba97c4..26f2db183fbf 100644 --- a/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp +++ b/llvm/lib/Transforms/Scalar/IVUsersPrinter.cpp @@ -8,7 +8,6 @@ #include "llvm/Transforms/Scalar/IVUsersPrinter.h" #include "llvm/Analysis/IVUsers.h" -#include "llvm/Support/Debug.h" using namespace llvm; #define DEBUG_TYPE "iv-users" diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index ceb03eb17f6d..e977dd18be9f 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -25,10 +25,7 @@ #include "llvm/Transforms/Scalar/IndVarSimplify.h" #include "llvm/ADT/APFloat.h" -#include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -74,11 +71,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -387,7 +382,7 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI, MSSAU.get()); // Delete the old floating point increment. 
- Incr->replaceAllUsesWith(UndefValue::get(Incr->getType())); + Incr->replaceAllUsesWith(PoisonValue::get(Incr->getType())); RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI, MSSAU.get()); // If the FP induction variable still has uses, this is because something else @@ -605,10 +600,10 @@ bool IndVarSimplify::simplifyAndExtend(Loop *L, Intrinsic::getName(Intrinsic::experimental_guard)); bool HasGuards = GuardDecl && !GuardDecl->use_empty(); - SmallVector<PHINode*, 8> LoopPhis; - for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { - LoopPhis.push_back(cast<PHINode>(I)); - } + SmallVector<PHINode *, 8> LoopPhis; + for (PHINode &PN : L->getHeader()->phis()) + LoopPhis.push_back(&PN); + // Each round of simplification iterates through the SimplifyIVUsers worklist // for all current phis, then determines whether any IVs can be // widened. Widening adds new phis to LoopPhis, inducing another round of diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 0e5653eeb7d5..799669a19796 100644 --- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -56,8 +56,6 @@ #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/BasicBlock.h" @@ -1411,12 +1409,12 @@ bool LoopConstrainer::run() { bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate; Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate); - if (!MaybeSR.hasValue()) { + if (!MaybeSR) { LLVM_DEBUG(dbgs() << "irce: could not compute subranges\n"); return false; } - SubRanges SR = MaybeSR.getValue(); + SubRanges SR = *MaybeSR; bool 
Increasing = MainLoopStructure.IndVarIncreasing; IntegerType *IVTy = cast<IntegerType>(Range.getBegin()->getType()); @@ -1429,9 +1427,9 @@ bool LoopConstrainer::run() { // constructor. ClonedLoop PreLoop, PostLoop; bool NeedsPreLoop = - Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue(); + Increasing ? SR.LowLimit.has_value() : SR.HighLimit.has_value(); bool NeedsPostLoop = - Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue(); + Increasing ? SR.HighLimit.has_value() : SR.LowLimit.has_value(); Value *ExitPreLoopAt = nullptr; Value *ExitMainLoopAt = nullptr; @@ -1710,7 +1708,7 @@ IntersectSignedRange(ScalarEvolution &SE, const InductiveRangeCheck::Range &R2) { if (R2.isEmpty(SE, /* IsSigned */ true)) return None; - if (!R1.hasValue()) + if (!R1) return R2; auto &R1Value = R1.getValue(); // We never return empty ranges from this function, and R1 is supposed to be @@ -1739,7 +1737,7 @@ IntersectUnsignedRange(ScalarEvolution &SE, const InductiveRangeCheck::Range &R2) { if (R2.isEmpty(SE, /* IsSigned */ false)) return None; - if (!R1.hasValue()) + if (!R1) return R2; auto &R1Value = R1.getValue(); // We never return empty ranges from this function, and R1 is supposed to be @@ -1763,10 +1761,14 @@ IntersectUnsignedRange(ScalarEvolution &SE, } PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) { - auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &DT = AM.getResult<DominatorTreeAnalysis>(F); - auto &BPI = AM.getResult<BranchProbabilityAnalysis>(F); LoopInfo &LI = AM.getResult<LoopAnalysis>(F); + // There are no loops in the function. Return before computing other expensive + // analyses. + if (LI.empty()) + return PreservedAnalyses::all(); + auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + auto &BPI = AM.getResult<BranchProbabilityAnalysis>(F); // Get BFI analysis result on demand. Please note that modification of // CFG invalidates this analysis and we should handle it. 
@@ -1854,7 +1856,7 @@ InductiveRangeCheckElimination::isProfitableToTransform(const Loop &L, LoopStructure &LS) { if (SkipProfitabilityChecks) return true; - if (GetBFI.hasValue()) { + if (GetBFI) { BlockFrequencyInfo &BFI = (*GetBFI)(); uint64_t hFreq = BFI.getBlockFreq(LS.Header).getFrequency(); uint64_t phFreq = BFI.getBlockFreq(L.getLoopPreheader()).getFrequency(); @@ -1920,12 +1922,12 @@ bool InductiveRangeCheckElimination::run( const char *FailureReason = nullptr; Optional<LoopStructure> MaybeLoopStructure = LoopStructure::parseLoopStructure(SE, *L, FailureReason); - if (!MaybeLoopStructure.hasValue()) { + if (!MaybeLoopStructure) { LLVM_DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason << "\n";); return false; } - LoopStructure LS = MaybeLoopStructure.getValue(); + LoopStructure LS = *MaybeLoopStructure; if (!isProfitableToTransform(*L, LS)) return false; const SCEVAddRecExpr *IndVar = @@ -1946,10 +1948,10 @@ bool InductiveRangeCheckElimination::run( for (InductiveRangeCheck &IRC : RangeChecks) { auto Result = IRC.computeSafeIterationSpace(SE, IndVar, LS.IsSignedPredicate); - if (Result.hasValue()) { + if (Result) { auto MaybeSafeIterRange = IntersectRange(SE, SafeIterRange, Result.getValue()); - if (MaybeSafeIterRange.hasValue()) { + if (MaybeSafeIterRange) { assert( !MaybeSafeIterRange.getValue().isEmpty(SE, LS.IsSignedPredicate) && "We should never return empty ranges!"); @@ -1959,7 +1961,7 @@ bool InductiveRangeCheckElimination::run( } } - if (!SafeIterRange.hasValue()) + if (!SafeIterRange) return false; LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT, diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 8f5933b7bd71..5eefde2e37a1 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -92,8 +92,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" 
-#include "llvm/ADT/None.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AssumptionCache.h" @@ -182,7 +180,7 @@ public: class InferAddressSpacesImpl { AssumptionCache &AC; - DominatorTree *DT = nullptr; + const DominatorTree *DT = nullptr; const TargetTransformInfo *TTI = nullptr; const DataLayout *DL = nullptr; @@ -213,10 +211,11 @@ class InferAddressSpacesImpl { // Changes the flat address expressions in function F to point to specific // address spaces if InferredAddrSpace says so. Postorder is the postorder of // all flat expressions in the use-def graph of function F. - bool rewriteWithNewAddressSpaces( - const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder, - const ValueToAddrSpaceMapTy &InferredAddrSpace, - const PredicatedAddrSpaceMapTy &PredicatedAS, Function *F) const; + bool + rewriteWithNewAddressSpaces(ArrayRef<WeakTrackingVH> Postorder, + const ValueToAddrSpaceMapTy &InferredAddrSpace, + const PredicatedAddrSpaceMapTy &PredicatedAS, + Function *F) const; void appendsFlatAddressExpressionToPostorderStack( Value *V, PostorderStackTy &PostorderStack, @@ -240,7 +239,7 @@ class InferAddressSpacesImpl { unsigned getPredicatedAddrSpace(const Value &V, Value *Opnd) const; public: - InferAddressSpacesImpl(AssumptionCache &AC, DominatorTree *DT, + InferAddressSpacesImpl(AssumptionCache &AC, const DominatorTree *DT, const TargetTransformInfo *TTI, unsigned FlatAddrSpace) : AC(AC), DT(DT), TTI(TTI), FlatAddrSpace(FlatAddrSpace) {} bool run(Function &F); @@ -280,15 +279,15 @@ static bool isNoopPtrIntCastPair(const Operator *I2P, const DataLayout &DL, // arithmetic may also be undefined after invalid pointer reinterpret cast. // However, as we confirm through the target hooks that it's a no-op // addrspacecast, it doesn't matter since the bits should be the same. 
+ unsigned P2IOp0AS = P2I->getOperand(0)->getType()->getPointerAddressSpace(); + unsigned I2PAS = I2P->getType()->getPointerAddressSpace(); return CastInst::isNoopCast(Instruction::CastOps(I2P->getOpcode()), I2P->getOperand(0)->getType(), I2P->getType(), DL) && CastInst::isNoopCast(Instruction::CastOps(P2I->getOpcode()), P2I->getOperand(0)->getType(), P2I->getType(), DL) && - TTI->isNoopAddrSpaceCast( - P2I->getOperand(0)->getType()->getPointerAddressSpace(), - I2P->getType()->getPointerAddressSpace()); + (P2IOp0AS == I2PAS || TTI->isNoopAddrSpaceCast(P2IOp0AS, I2PAS)); } // Returns true if V is an address expression. @@ -332,8 +331,7 @@ getPointerOperands(const Value &V, const DataLayout &DL, switch (Op.getOpcode()) { case Instruction::PHI: { auto IncomingValues = cast<PHINode>(Op).incoming_values(); - return SmallVector<Value *, 2>(IncomingValues.begin(), - IncomingValues.end()); + return {IncomingValues.begin(), IncomingValues.end()}; } case Instruction::BitCast: case Instruction::AddrSpaceCast: @@ -655,10 +653,13 @@ Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace( case Instruction::IntToPtr: { assert(isNoopPtrIntCastPair(cast<Operator>(I), *DL, TTI)); Value *Src = cast<Operator>(I->getOperand(0))->getOperand(0); - assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace); - if (Src->getType() != NewPtrType) - return new BitCastInst(Src, NewPtrType); - return Src; + if (Src->getType() == NewPtrType) + return Src; + + // If we had a no-op inttoptr/ptrtoint pair, we may still have inferred a + // source address space from a generic pointer source need to insert a cast + // back. 
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(Src, NewPtrType); } default: llvm_unreachable("Unexpected opcode"); @@ -726,7 +727,7 @@ static Value *cloneConstantExprWithNewAddressSpace( NewOperands.push_back(cast<Constant>(NewOperand)); continue; } - if (auto CExpr = dyn_cast<ConstantExpr>(Operand)) + if (auto *CExpr = dyn_cast<ConstantExpr>(Operand)) if (Value *NewOperand = cloneConstantExprWithNewAddressSpace( CExpr, NewAddrSpace, ValueWithNewAddrSpace, DL, TTI)) { IsNew = true; @@ -738,7 +739,7 @@ static Value *cloneConstantExprWithNewAddressSpace( } // If !IsNew, we will replace the Value with itself. However, replaced values - // are assumed to wrapped in a addrspace cast later so drop it now. + // are assumed to wrapped in an addrspacecast cast later so drop it now. if (!IsNew) return nullptr; @@ -821,8 +822,8 @@ bool InferAddressSpacesImpl::run(Function &F) { // Changes the address spaces of the flat address expressions who are inferred // to point to a specific address space. - return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, - PredicatedAS, &F); + return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, PredicatedAS, + &F); } // Constants need to be tracked through RAUW to handle cases with nested @@ -1010,7 +1011,7 @@ static bool isSimplePointerUseValidToReplace(const TargetTransformInfo &TTI, } /// Update memory intrinsic uses that require more complex processing than -/// simple memory instructions. Thse require re-mangling and may have multiple +/// simple memory instructions. These require re-mangling and may have multiple /// pointer operands. 
static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, Value *NewV) { @@ -1020,8 +1021,7 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias); if (auto *MSI = dyn_cast<MemSetInst>(MI)) { - B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(), - MaybeAlign(MSI->getDestAlignment()), + B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(), MSI->getDestAlign(), false, // isVolatile TBAA, ScopeMD, NoAliasMD); } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) { @@ -1104,7 +1104,7 @@ static Value::use_iterator skipToNextUser(Value::use_iterator I, } bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( - const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder, + ArrayRef<WeakTrackingVH> Postorder, const ValueToAddrSpaceMapTy &InferredAddrSpace, const PredicatedAddrSpaceMapTy &PredicatedAS, Function *F) const { // For each address expression to be modified, creates a clone of it with its @@ -1178,7 +1178,7 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( I = skipToNextUser(I, E); if (isSimplePointerUseValidToReplace( - TTI, U, V->getType()->getPointerAddressSpace())) { + *TTI, U, V->getType()->getPointerAddressSpace())) { // If V is used as the pointer operand of a compatible memory operation, // sets the pointer operand to NewV. This replacement does not change // the element type, so the resultant load/store is still valid. 
@@ -1239,8 +1239,16 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( if (!cast<PointerType>(ASC->getType()) ->hasSameElementTypeAs( cast<PointerType>(NewV->getType()))) { + BasicBlock::iterator InsertPos; + if (Instruction *NewVInst = dyn_cast<Instruction>(NewV)) + InsertPos = std::next(NewVInst->getIterator()); + else if (Instruction *VInst = dyn_cast<Instruction>(V)) + InsertPos = std::next(VInst->getIterator()); + else + InsertPos = ASC->getIterator(); + NewV = CastInst::Create(Instruction::BitCast, NewV, - ASC->getType(), "", ASC); + ASC->getType(), "", &*InsertPos); } ASC->replaceAllUsesWith(NewV); DeadInstructions.push_back(ASC); @@ -1249,12 +1257,18 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( } // Otherwise, replaces the use with flat(NewV). - if (Instruction *Inst = dyn_cast<Instruction>(V)) { + if (Instruction *VInst = dyn_cast<Instruction>(V)) { // Don't create a copy of the original addrspacecast. if (U == V && isa<AddrSpaceCastInst>(V)) continue; - BasicBlock::iterator InsertPos = std::next(Inst->getIterator()); + // Insert the addrspacecast after NewV. 
+ BasicBlock::iterator InsertPos; + if (Instruction *NewVInst = dyn_cast<Instruction>(NewV)) + InsertPos = std::next(NewVInst->getIterator()); + else + InsertPos = std::next(VInst->getIterator()); + while (isa<PHINode>(InsertPos)) ++InsertPos; U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos)); diff --git a/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp b/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp index c11d2e4c1d6b..4644905adba3 100644 --- a/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp +++ b/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp @@ -7,21 +7,17 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/InstSimplifyPass.h" -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -55,7 +51,7 @@ static bool runImpl(Function &F, const SimplifyQuery &SQ, DeadInstsInBB.push_back(&I); Changed = true; } else if (!I.use_empty()) { - if (Value *V = SimplifyInstruction(&I, SQ, ORE)) { + if (Value *V = simplifyInstruction(&I, SQ, ORE)) { // Mark all uses for resimplification next time round the loop. 
for (User *U : I.users()) Next->insert(cast<Instruction>(U)); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index a3efad104ca6..5caefc422921 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -56,7 +56,6 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" -#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -74,7 +73,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> #include <cassert> -#include <cstddef> #include <cstdint> #include <iterator> #include <memory> @@ -106,11 +104,6 @@ static cl::opt<bool> PrintLVIAfterJumpThreading( cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false), cl::Hidden); -static cl::opt<bool> JumpThreadingFreezeSelectCond( - "jump-threading-freeze-select-cond", - cl::desc("Freeze the condition when unfolding select"), cl::init(false), - cl::Hidden); - static cl::opt<bool> ThreadAcrossLoopHeaders( "jump-threading-across-loop-headers", cl::desc("Allow JumpThreading to thread across loop headers, for testing"), @@ -140,8 +133,7 @@ namespace { public: static char ID; // Pass identification - JumpThreading(bool InsertFreezeWhenUnfoldingSelect = false, int T = -1) - : FunctionPass(ID), Impl(InsertFreezeWhenUnfoldingSelect, T) { + JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) { initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); } @@ -175,12 +167,11 @@ INITIALIZE_PASS_END(JumpThreading, "jump-threading", "Jump Threading", false, false) // Public interface to the Jump Threading pass -FunctionPass *llvm::createJumpThreadingPass(bool InsertFr, int Threshold) { - return new JumpThreading(InsertFr, Threshold); +FunctionPass *llvm::createJumpThreadingPass(int Threshold) { + return new JumpThreading(Threshold); } -JumpThreadingPass::JumpThreadingPass(bool InsertFr, int T) { 
- InsertFreezeWhenUnfoldingSelect = JumpThreadingFreezeSelectCond | InsertFr; +JumpThreadingPass::JumpThreadingPass(int T) { DefaultBBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T); } @@ -326,7 +317,7 @@ bool JumpThreading::runOnFunction(Function &F) { std::unique_ptr<BlockFrequencyInfo> BFI; std::unique_ptr<BranchProbabilityInfo> BPI; if (F.hasProfileData()) { - LoopInfo LI{DominatorTree(F)}; + LoopInfo LI{*DT}; BPI.reset(new BranchProbabilityInfo(F, LI, TLI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } @@ -491,14 +482,16 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, // at the end of block. RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. -static void replaceFoldableUses(Instruction *Cond, Value *ToVal) { +static bool replaceFoldableUses(Instruction *Cond, Value *ToVal, + BasicBlock *KnownAtEndOfBB) { + bool Changed = false; assert(Cond->getType() == ToVal->getType()); - auto *BB = Cond->getParent(); // We can unconditionally replace all uses in non-local blocks (i.e. uses // strictly dominated by BB), since LVI information is true from the // terminator of BB. - replaceNonLocalUsesWith(Cond, ToVal); - for (Instruction &I : reverse(*BB)) { + if (Cond->getParent() == KnownAtEndOfBB) + Changed |= replaceNonLocalUsesWith(Cond, ToVal); + for (Instruction &I : reverse(*KnownAtEndOfBB)) { // Reached the Cond whose uses we are trying to replace, so there are no // more uses. if (&I == Cond) @@ -507,10 +500,13 @@ static void replaceFoldableUses(Instruction *Cond, Value *ToVal) { // of BB, where we know Cond is ToVal. 
if (!isGuaranteedToTransferExecutionToSuccessor(&I)) break; - I.replaceUsesOfWith(Cond, ToVal); + Changed |= I.replaceUsesOfWith(Cond, ToVal); } - if (Cond->use_empty() && !Cond->mayHaveSideEffects()) + if (Cond->use_empty() && !Cond->mayHaveSideEffects()) { Cond->eraseFromParent(); + Changed = true; + } + return Changed; } /// Return the cost of duplicating a piece of this block from first non-phi @@ -792,6 +788,7 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( if (Preference != WantInteger) return false; if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) { + const DataLayout &DL = BO->getModule()->getDataLayout(); PredValueInfoTy LHSVals; computeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals, WantInteger, RecursionSet, CxtI); @@ -799,7 +796,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( // Try to use constant folding to simplify the binary operator. for (const auto &LHSVal : LHSVals) { Constant *V = LHSVal.first; - Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI); + Constant *Folded = + ConstantFoldBinaryOpOperands(BO->getOpcode(), V, CI, DL); if (Constant *KC = getKnownConstant(Folded, WantInteger)) Result.emplace_back(KC, LHSVal.second); @@ -835,7 +833,7 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( LHS = CmpLHS->DoPHITranslation(BB, PredBB); RHS = PN->getIncomingValue(i); } - Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL}); + Value *Res = simplifyCmpInst(Pred, LHS, RHS, {DL}); if (!Res) { if (!isa<Constant>(RHS)) continue; @@ -1135,34 +1133,21 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) { return ConstantFolded; } - if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) { + // Some of the following optimization can safely work on the unfrozen cond. 
+ Value *CondWithoutFreeze = CondInst; + if (auto *FI = dyn_cast<FreezeInst>(CondInst)) + CondWithoutFreeze = FI->getOperand(0); + + if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondWithoutFreeze)) { // If we're branching on a conditional, LVI might be able to determine // it's value at the branch instruction. We only handle comparisons // against a constant at this time. - // TODO: This should be extended to handle switches as well. - BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); - Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1)); - if (CondBr && CondConst) { - // We should have returned as soon as we turn a conditional branch to - // unconditional. Because its no longer interesting as far as jump - // threading is concerned. - assert(CondBr->isConditional() && "Threading on unconditional terminator"); - + if (Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1))) { LazyValueInfo::Tristate Ret = LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0), - CondConst, CondBr, /*UseBlockValue=*/false); + CondConst, BB->getTerminator(), + /*UseBlockValue=*/false); if (Ret != LazyValueInfo::Unknown) { - unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0; - unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1; - BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove); - ToRemoveSucc->removePredecessor(BB, true); - BranchInst *UncondBr = - BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr); - UncondBr->setDebugLoc(CondBr->getDebugLoc()); - ++NumFolds; - CondBr->eraseFromParent(); - if (CondCmp->use_empty()) - CondCmp->eraseFromParent(); // We can safely replace *some* uses of the CondInst if it has // exactly one value as returned by LVI. RAUW is incorrect in the // presence of guards and assumes, that have the `Cond` as the use. 
This @@ -1170,17 +1155,11 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) { // at the end of block, but RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. - else if (CondCmp->getParent() == BB) { - auto *CI = Ret == LazyValueInfo::True ? - ConstantInt::getTrue(CondCmp->getType()) : - ConstantInt::getFalse(CondCmp->getType()); - replaceFoldableUses(CondCmp, CI); - } - DTU->applyUpdatesPermissive( - {{DominatorTree::Delete, BB, ToRemoveSucc}}); - if (HasProfileData) - BPI->eraseBlock(BB); - return true; + auto *CI = Ret == LazyValueInfo::True ? + ConstantInt::getTrue(CondCmp->getType()) : + ConstantInt::getFalse(CondCmp->getType()); + if (replaceFoldableUses(CondCmp, CI, BB)) + return true; } // We did not manage to simplify this branch, try to see whether @@ -1198,11 +1177,7 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) { // for loads that are used by a switch or by the condition for the branch. If // we see one, check to see if it's partially redundant. If so, insert a PHI // which can then be used to thread the values. - Value *SimplifyValue = CondInst; - - if (auto *FI = dyn_cast<FreezeInst>(SimplifyValue)) - // Look into freeze's operand - SimplifyValue = FI->getOperand(0); + Value *SimplifyValue = CondWithoutFreeze; if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue)) if (isa<Constant>(CondCmp->getOperand(1))) @@ -1227,10 +1202,7 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) { // If this is an otherwise-unfoldable branch on a phi node or freeze(phi) in // the current block, see if we can simplify. - PHINode *PN = dyn_cast<PHINode>( - isa<FreezeInst>(CondInst) ? 
cast<FreezeInst>(CondInst)->getOperand(0) - : CondInst); - + PHINode *PN = dyn_cast<PHINode>(CondWithoutFreeze); if (PN && PN->getParent() == BB && isa<BranchInst>(BB->getTerminator())) return processBranchOnPHI(PN); @@ -1253,6 +1225,17 @@ bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) { return false; Value *Cond = BI->getCondition(); + // Assuming that predecessor's branch was taken, if pred's branch condition + // (V) implies Cond, Cond can be either true, undef, or poison. In this case, + // freeze(Cond) is either true or a nondeterministic value. + // If freeze(Cond) has only one use, we can freely fold freeze(Cond) to true + // without affecting other instructions. + auto *FICond = dyn_cast<FreezeInst>(Cond); + if (FICond && FICond->hasOneUse()) + Cond = FICond->getOperand(0); + else + FICond = nullptr; + BasicBlock *CurrentBB = BB; BasicBlock *CurrentPred = BB->getSinglePredecessor(); unsigned Iter = 0; @@ -1269,6 +1252,15 @@ bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) { bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB; Optional<bool> Implication = isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue); + + // If the branch condition of BB (which is Cond) and CurrentPred are + // exactly the same freeze instruction, Cond can be folded into CondIsTrue. + if (!Implication && FICond && isa<FreezeInst>(PBI->getCondition())) { + if (cast<FreezeInst>(PBI->getCondition())->getOperand(0) == + FICond->getOperand(0)) + Implication = CondIsTrue; + } + if (Implication) { BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1); BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 
1 : 0); @@ -1277,6 +1269,9 @@ bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) { UncondBI->setDebugLoc(BI->getDebugLoc()); ++NumFolds; BI->eraseFromParent(); + if (FICond) + FICond->eraseFromParent(); + DTU->applyUpdatesPermissive({{DominatorTree::Delete, BB, RemoveSucc}}); if (HasProfileData) BPI->eraseBlock(BB); @@ -1338,10 +1333,10 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { combineMetadataForCSE(NLoadI, LoadI, false); }; - // If the returned value is the load itself, replace with an undef. This can + // If the returned value is the load itself, replace with poison. This can // only happen in dead loops. if (AvailableVal == LoadI) - AvailableVal = UndefValue::get(LoadI->getType()); + AvailableVal = PoisonValue::get(LoadI->getType()); if (AvailableVal->getType() != LoadI->getType()) AvailableVal = CastInst::CreateBitOrPointerCast( AvailableVal, LoadI->getType(), "", LoadI); @@ -1566,10 +1561,8 @@ findMostPopularDest(BasicBlock *BB, DestPopularity[PredToDest.second]++; // Find the most popular dest. - using VT = decltype(DestPopularity)::value_type; auto MostPopular = std::max_element( - DestPopularity.begin(), DestPopularity.end(), - [](const VT &L, const VT &R) { return L.second < R.second; }); + DestPopularity.begin(), DestPopularity.end(), llvm::less_second()); // Okay, we have finally picked the most popular destination. return MostPopular->first; @@ -1742,9 +1735,8 @@ bool JumpThreadingPass::processThreadableEdges(Value *Cond, BasicBlock *BB, // at the end of block, but RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. 
- else if (OnlyVal && OnlyVal != MultipleVal && - CondInst->getParent() == BB) - replaceFoldableUses(CondInst, OnlyVal); + else if (OnlyVal && OnlyVal != MultipleVal) + replaceFoldableUses(CondInst, OnlyVal, BB); } return true; } @@ -2672,7 +2664,7 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred( // If this instruction can be simplified after the operands are updated, // just use the simplified value instead. This frequently happens due to // phi translation. - if (Value *IV = SimplifyInstruction( + if (Value *IV = simplifyInstruction( New, {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) { ValueMapping[&*BI] = IV; @@ -2912,9 +2904,7 @@ bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) { continue; // Expand the select. Value *Cond = SI->getCondition(); - if (InsertFreezeWhenUnfoldingSelect && - !isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI, - &DTU->getDomTree())) + if (!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI)) Cond = new FreezeInst(Cond, "cond.fr", SI); Instruction *Term = SplitBlockAndInsertIfThen(Cond, SI, false); BasicBlock *SplitBB = SI->getParent(); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 7fb1a25bdf13..492f4e40395a 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -37,29 +37,27 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LICM.h" +#include "llvm/ADT/PriorityWorklist.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include 
"llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -78,7 +76,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -88,6 +85,11 @@ #include <utility> using namespace llvm; +namespace llvm { +class BlockFrequencyInfo; +class LPMUpdater; +} // namespace llvm + #define DEBUG_TYPE "licm" STATISTIC(NumCreatedBlocks, "Number of blocks created"); @@ -114,8 +116,7 @@ static cl::opt<uint32_t> MaxNumUsesTraversed( // Experimental option to allow imprecision in LICM in pathological cases, in // exchange for faster compile. This is to be removed if MemorySSA starts to -// address the same issue. This flag applies only when LICM uses MemorySSA -// instead on AliasSetTracker. LICM calls MemorySSAWalker's +// address the same issue. LICM calls MemorySSAWalker's // getClobberingMemoryAccess, up to the value of the Cap, getting perfect // accuracy. Afterwards, LICM will call into MemorySSA's getDefiningAccess, // which may not be precise, since optimizeUses is capped. 
The result is @@ -143,37 +144,32 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, bool LoopNestMode); static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, const Loop *CurLoop, - ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, + ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater &MSSAU, OptimizationRemarkEmitter *ORE); -static bool isSafeToExecuteUnconditionally(Instruction &Inst, - const DominatorTree *DT, - const TargetLibraryInfo *TLI, - const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE, - const Instruction *CtxI = nullptr); -static bool pointerInvalidatedByLoop(MemoryLocation MemLoc, - AliasSetTracker *CurAST, Loop *CurLoop, - AAResults *AA); -static bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU, - Loop *CurLoop, Instruction &I, - SinkAndHoistLICMFlags &Flags); -static bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA, - MemoryUse &MU); +static bool isSafeToExecuteUnconditionally( + Instruction &Inst, const DominatorTree *DT, const TargetLibraryInfo *TLI, + const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, const Instruction *CtxI, + bool AllowSpeculation); +static bool pointerInvalidatedByLoop(MemorySSA *MSSA, MemoryUse *MU, + Loop *CurLoop, Instruction &I, + SinkAndHoistLICMFlags &Flags); +static bool pointerInvalidatedByBlock(BasicBlock &BB, MemorySSA &MSSA, + MemoryUse &MU); static Instruction *cloneInstructionInExitBlock( Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI, - const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU); + const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater 
&MSSAU); static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU); + MemorySSAUpdater &MSSAU); static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU, ScalarEvolution *SE); + MemorySSAUpdater &MSSAU, ScalarEvolution *SE); static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L, function_ref<void(Instruction *)> Fn); @@ -188,21 +184,26 @@ struct LoopInvariantCodeMotion { OptimizationRemarkEmitter *ORE, bool LoopNestMode = false); LoopInvariantCodeMotion(unsigned LicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap) + unsigned LicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation) : LicmMssaOptCap(LicmMssaOptCap), - LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {} + LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap), + LicmAllowSpeculation(LicmAllowSpeculation) {} private: unsigned LicmMssaOptCap; unsigned LicmMssaNoAccForPromotionCap; + bool LicmAllowSpeculation; }; struct LegacyLICMPass : public LoopPass { static char ID; // Pass identification, replacement for typeid LegacyLICMPass( unsigned LicmMssaOptCap = SetLicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap) - : LoopPass(ID), LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap) { + unsigned LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation = true) + : LoopPass(ID), LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + LicmAllowSpeculation) { initializeLegacyLICMPassPass(*PassRegistry::getPassRegistry()); } @@ -265,7 +266,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, // but ORE cannot be preserved (see comment before the pass definition). 
OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); - LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + LoopInvariantCodeMotion LICM(Opts.MssaOptCap, Opts.MssaNoAccForPromotionCap, + Opts.AllowSpeculation); if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA, &ORE)) return PreservedAnalyses::all(); @@ -279,6 +281,16 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, return PA; } +void LICMPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<LICMPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + + OS << "<"; + OS << (Opts.AllowSpeculation ? "" : "no-") << "allowspeculation"; + OS << ">"; +} + PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { @@ -290,7 +302,8 @@ PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM, // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(LN.getParent()); - LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + LoopInvariantCodeMotion LICM(Opts.MssaOptCap, Opts.MssaNoAccForPromotionCap, + Opts.AllowSpeculation); Loop &OutermostLoop = LN.getOutermostLoop(); bool Changed = LICM.runOnLoop(&OutermostLoop, &AR.AA, &AR.LI, &AR.DT, AR.BFI, @@ -308,6 +321,16 @@ PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM, return PA; } +void LNICMPass::printPipeline( + raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { + static_cast<PassInfoMixin<LNICMPass> *>(this)->printPipeline( + OS, MapClassName2PassName); + + OS << "<"; + OS << (Opts.AllowSpeculation ? 
"" : "no-") << "allowspeculation"; + OS << ">"; +} + char LegacyLICMPass::ID = 0; INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false, false) @@ -321,8 +344,10 @@ INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false, Pass *llvm::createLICMPass() { return new LegacyLICMPass(); } Pass *llvm::createLICMPass(unsigned LicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap) { - return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + unsigned LicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation) { + return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + LicmAllowSpeculation); } llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags(bool IsSink, Loop *L, @@ -365,6 +390,7 @@ bool LoopInvariantCodeMotion::runOnLoop( bool Changed = false; assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); + MSSA->ensureOptimizedUses(); // If this loop has metadata indicating that LICM is not to be performed then // just exit. @@ -411,14 +437,15 @@ bool LoopInvariantCodeMotion::runOnLoop( if (L->hasDedicatedExits()) Changed |= LoopNestMode ? sinkRegionForLoopNest(DT->getNode(L->getHeader()), AA, LI, - DT, BFI, TLI, TTI, L, &MSSAU, + DT, BFI, TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE) : sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, - TLI, TTI, L, &MSSAU, &SafetyInfo, Flags, ORE); + TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE); Flags.setIsSink(false); if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L, - &MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode); + MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode, + LicmAllowSpeculation); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. 
@@ -451,8 +478,7 @@ bool LoopInvariantCodeMotion::runOnLoop( PredIteratorCache PIC; // Promoting one set of accesses may make the pointers for another set - // loop invariant, so run this in a loop (with the MaybePromotable set - // decreasing in size over time). + // loop invariant, so run this in a loop. bool Promoted = false; bool LocalPromoted; do { @@ -460,8 +486,8 @@ bool LoopInvariantCodeMotion::runOnLoop( for (const SmallSetVector<Value *, 8> &PointerMustAliases : collectPromotionCandidates(MSSA, AA, L)) { LocalPromoted |= promoteLoopAccessesToScalars( - PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, - LI, DT, TLI, L, &MSSAU, &SafetyInfo, ORE); + PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI, + DT, TLI, L, MSSAU, &SafetyInfo, ORE, LicmAllowSpeculation); } Promoted |= LocalPromoted; } while (LocalPromoted); @@ -502,17 +528,17 @@ bool LoopInvariantCodeMotion::runOnLoop( bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, - Loop *CurLoop, MemorySSAUpdater *MSSAU, + Loop *CurLoop, MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE, Loop *OutermostLoop) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && - CurLoop != nullptr && MSSAU != nullptr && SafetyInfo != nullptr && + CurLoop != nullptr && SafetyInfo != nullptr && "Unexpected input to sinkRegion."); - // We want to visit children before parents. We will enque all the parents + // We want to visit children before parents. We will enqueue all the parents // before their children in the worklist and process the worklist in reverse // order. 
SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop); @@ -550,8 +576,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, if (!I.mayHaveSideEffects() && isNotUsedOrFreeInLoop(I, LoopNestMode ? OutermostLoop : CurLoop, SafetyInfo, TTI, FreeInLoop, LoopNestMode) && - canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/nullptr, MSSAU, true, - &Flags, ORE)) { + canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE)) { if (sink(I, LI, DT, BFI, CurLoop, SafetyInfo, MSSAU, ORE)) { if (!FreeInLoop) { ++II; @@ -564,14 +589,14 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, } } if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + MSSAU.getMemorySSA()->verifyMemorySSA(); return Changed; } bool llvm::sinkRegionForLoopNest( DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, - Loop *CurLoop, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, + Loop *CurLoop, MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { bool Changed = false; @@ -600,7 +625,7 @@ private: LoopInfo *LI; DominatorTree *DT; Loop *CurLoop; - MemorySSAUpdater *MSSAU; + MemorySSAUpdater &MSSAU; // A map of blocks in the loop to the block their instructions will be hoisted // to. @@ -612,7 +637,7 @@ private: public: ControlFlowHoister(LoopInfo *LI, DominatorTree *DT, Loop *CurLoop, - MemorySSAUpdater *MSSAU) + MemorySSAUpdater &MSSAU) : LI(LI), DT(DT), CurLoop(CurLoop), MSSAU(MSSAU) {} void registerPossiblyHoistableBranch(BranchInst *BI) { @@ -788,7 +813,7 @@ public: if (HoistTarget == InitialPreheader) { // Phis in the loop header now need to use the new preheader. 
InitialPreheader->replaceSuccessorsPhiUsesWith(HoistCommonSucc); - MSSAU->wireOldPredecessorsToNewImmediatePredecessor( + MSSAU.wireOldPredecessorsToNewImmediatePredecessor( HoistTarget->getSingleSuccessor(), HoistCommonSucc, {HoistTarget}); // The new preheader dominates the loop header. DomTreeNode *PreheaderNode = DT->getNode(HoistCommonSucc); @@ -822,13 +847,14 @@ public: bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, Loop *CurLoop, - MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, - OptimizationRemarkEmitter *ORE, bool LoopNestMode) { + OptimizationRemarkEmitter *ORE, bool LoopNestMode, + bool AllowSpeculation) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && - CurLoop != nullptr && MSSAU != nullptr && SafetyInfo != nullptr && + CurLoop != nullptr && SafetyInfo != nullptr && "Unexpected input to hoistRegion."); ControlFlowHoister CFH(LI, DT, CurLoop, MSSAU); @@ -873,11 +899,10 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, // and we have accurately duplicated the control flow from the loop header // to that block. 
if (CurLoop->hasLoopInvariantOperands(&I) && - canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/ nullptr, MSSAU, - true, &Flags, ORE) && + canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) && isSafeToExecuteUnconditionally( I, DT, TLI, CurLoop, SafetyInfo, ORE, - CurLoop->getLoopPreheader()->getTerminator())) { + CurLoop->getLoopPreheader()->getTerminator(), AllowSpeculation)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE); HoistedInstructions.push_back(&I); @@ -982,7 +1007,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, } } if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + MSSAU.getMemorySSA()->verifyMemorySSA(); // Now that we've finished hoisting make sure that LI and DT are still // valid. @@ -1083,30 +1108,19 @@ bool isHoistableAndSinkableInst(Instruction &I) { isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) || isa<InsertValueInst>(I) || isa<FreezeInst>(I)); } -/// Return true if all of the alias sets within this AST are known not to -/// contain a Mod, or if MSSA knows there are no MemoryDefs in the loop. -bool isReadOnly(AliasSetTracker *CurAST, const MemorySSAUpdater *MSSAU, - const Loop *L) { - if (CurAST) { - for (AliasSet &AS : *CurAST) { - if (!AS.isForwardingAliasSet() && AS.isMod()) { - return false; - } - } - return true; - } else { /*MSSAU*/ - for (auto *BB : L->getBlocks()) - if (MSSAU->getMemorySSA()->getBlockDefs(BB)) - return false; - return true; - } +/// Return true if MSSA knows there are no MemoryDefs in the loop. +bool isReadOnly(const MemorySSAUpdater &MSSAU, const Loop *L) { + for (auto *BB : L->getBlocks()) + if (MSSAU.getMemorySSA()->getBlockDefs(BB)) + return false; + return true; } /// Return true if I is the only Instruction with a MemoryAccess in L. 
bool isOnlyMemoryAccess(const Instruction *I, const Loop *L, - const MemorySSAUpdater *MSSAU) { + const MemorySSAUpdater &MSSAU) { for (auto *BB : L->getBlocks()) - if (auto *Accs = MSSAU->getMemorySSA()->getBlockAccesses(BB)) { + if (auto *Accs = MSSAU.getMemorySSA()->getBlockAccesses(BB)) { int NotAPhi = 0; for (const auto &Acc : *Accs) { if (isa<MemoryPhi>(&Acc)) @@ -1121,22 +1135,15 @@ bool isOnlyMemoryAccess(const Instruction *I, const Loop *L, } bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, - Loop *CurLoop, AliasSetTracker *CurAST, - MemorySSAUpdater *MSSAU, + Loop *CurLoop, MemorySSAUpdater &MSSAU, bool TargetExecutesOncePerLoop, - SinkAndHoistLICMFlags *Flags, + SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { - assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) && - "Either AliasSetTracker or MemorySSA should be initialized."); - // If we don't understand the instruction, bail early. if (!isHoistableAndSinkableInst(I)) return false; - MemorySSA *MSSA = MSSAU ? MSSAU->getMemorySSA() : nullptr; - if (MSSA) - assert(Flags != nullptr && "Flags cannot be null."); - + MemorySSA *MSSA = MSSAU.getMemorySSA(); // Loads have extra constraints we have to verify before we can hoist them. if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (!LI->isUnordered()) @@ -1156,13 +1163,8 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, if (isLoadInvariantInLoop(LI, DT, CurLoop)) return true; - bool Invalidated; - if (CurAST) - Invalidated = pointerInvalidatedByLoop(MemoryLocation::get(LI), CurAST, - CurLoop, AA); - else - Invalidated = pointerInvalidatedByLoopWithMSSA( - MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(LI)), CurLoop, I, *Flags); + bool Invalidated = pointerInvalidatedByLoop( + MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(LI)), CurLoop, I, Flags); // Check loop-invariant address because this may also be a sinkable load // whose address is not necessarily loop-invariant. 
if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand())) @@ -1210,24 +1212,17 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, if (AAResults::onlyAccessesArgPointees(Behavior)) { // TODO: expand to writeable arguments for (Value *Op : CI->args()) - if (Op->getType()->isPointerTy()) { - bool Invalidated; - if (CurAST) - Invalidated = pointerInvalidatedByLoop( - MemoryLocation::getBeforeOrAfter(Op), CurAST, CurLoop, AA); - else - Invalidated = pointerInvalidatedByLoopWithMSSA( + if (Op->getType()->isPointerTy() && + pointerInvalidatedByLoop( MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(CI)), CurLoop, I, - *Flags); - if (Invalidated) - return false; - } + Flags)) + return false; return true; } // If this call only reads from memory and there are no writes to memory // in the loop, we can hoist or sink the call as appropriate. - if (isReadOnly(CurAST, MSSAU, CurLoop)) + if (isReadOnly(MSSAU, CurLoop)) return true; } @@ -1238,21 +1233,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, } else if (auto *FI = dyn_cast<FenceInst>(&I)) { // Fences alias (most) everything to provide ordering. For the moment, // just give up if there are any other memory operations in the loop. - if (CurAST) { - auto Begin = CurAST->begin(); - assert(Begin != CurAST->end() && "must contain FI"); - if (std::next(Begin) != CurAST->end()) - // constant memory for instance, TODO: handle better - return false; - auto *UniqueI = Begin->getUniqueInstruction(); - if (!UniqueI) - // other memory op, give up - return false; - (void)FI; // suppress unused variable warning - assert(UniqueI == FI && "AS must contain FI"); - return true; - } else // MSSAU - return isOnlyMemoryAccess(FI, CurLoop, MSSAU); + return isOnlyMemoryAccess(FI, CurLoop, MSSAU); } else if (auto *SI = dyn_cast<StoreInst>(&I)) { if (!SI->isUnordered()) return false; // Don't sink/hoist volatile or ordered atomic store! 
@@ -1262,68 +1243,54 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // load store promotion instead. TODO: We can extend this to cases where // there is exactly one write to the location and that write dominates an // arbitrary number of reads in the loop. - if (CurAST) { - auto &AS = CurAST->getAliasSetFor(MemoryLocation::get(SI)); - - if (AS.isRef() || !AS.isMustAlias()) - // Quick exit test, handled by the full path below as well. - return false; - auto *UniqueI = AS.getUniqueInstruction(); - if (!UniqueI) - // other memory op, give up - return false; - assert(UniqueI == SI && "AS must contain SI"); + if (isOnlyMemoryAccess(SI, CurLoop, MSSAU)) return true; - } else { // MSSAU - if (isOnlyMemoryAccess(SI, CurLoop, MSSAU)) - return true; - // If there are more accesses than the Promotion cap or no "quota" to - // check clobber, then give up as we're not walking a list that long. - if (Flags->tooManyMemoryAccesses() || Flags->tooManyClobberingCalls()) - return false; - // If there are interfering Uses (i.e. their defining access is in the - // loop), or ordered loads (stored as Defs!), don't move this store. - // Could do better here, but this is conservatively correct. - // TODO: Cache set of Uses on the first walk in runOnLoop, update when - // moving accesses. Can also extend to dominating uses. - auto *SIMD = MSSA->getMemoryAccess(SI); - for (auto *BB : CurLoop->getBlocks()) - if (auto *Accesses = MSSA->getBlockAccesses(BB)) { - for (const auto &MA : *Accesses) - if (const auto *MU = dyn_cast<MemoryUse>(&MA)) { - auto *MD = MU->getDefiningAccess(); - if (!MSSA->isLiveOnEntryDef(MD) && - CurLoop->contains(MD->getBlock())) - return false; - // Disable hoisting past potentially interfering loads. Optimized - // Uses may point to an access outside the loop, as getClobbering - // checks the previous iteration when walking the backedge. - // FIXME: More precise: no Uses that alias SI. 
- if (!Flags->getIsSink() && !MSSA->dominates(SIMD, MU)) - return false; - } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) { - if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) { - (void)LI; // Silence warning. - assert(!LI->isUnordered() && "Expected unordered load"); + // If there are more accesses than the Promotion cap or no "quota" to + // check clobber, then give up as we're not walking a list that long. + if (Flags.tooManyMemoryAccesses() || Flags.tooManyClobberingCalls()) + return false; + // If there are interfering Uses (i.e. their defining access is in the + // loop), or ordered loads (stored as Defs!), don't move this store. + // Could do better here, but this is conservatively correct. + // TODO: Cache set of Uses on the first walk in runOnLoop, update when + // moving accesses. Can also extend to dominating uses. + auto *SIMD = MSSA->getMemoryAccess(SI); + for (auto *BB : CurLoop->getBlocks()) + if (auto *Accesses = MSSA->getBlockAccesses(BB)) { + for (const auto &MA : *Accesses) + if (const auto *MU = dyn_cast<MemoryUse>(&MA)) { + auto *MD = MU->getDefiningAccess(); + if (!MSSA->isLiveOnEntryDef(MD) && + CurLoop->contains(MD->getBlock())) + return false; + // Disable hoisting past potentially interfering loads. Optimized + // Uses may point to an access outside the loop, as getClobbering + // checks the previous iteration when walking the backedge. + // FIXME: More precise: no Uses that alias SI. + if (!Flags.getIsSink() && !MSSA->dominates(SIMD, MU)) + return false; + } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) { + if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) { + (void)LI; // Silence warning. + assert(!LI->isUnordered() && "Expected unordered load"); + return false; + } + // Any call, while it may not be clobbering SI, it may be a use. + if (auto *CI = dyn_cast<CallInst>(MD->getMemoryInst())) { + // Check if the call may read from the memory location written + // to by SI. 
Check CI's attributes and arguments; the number of + // such checks performed is limited above by NoOfMemAccTooLarge. + ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI)); + if (isModOrRefSet(MRI)) return false; - } - // Any call, while it may not be clobbering SI, it may be a use. - if (auto *CI = dyn_cast<CallInst>(MD->getMemoryInst())) { - // Check if the call may read from the memory location written - // to by SI. Check CI's attributes and arguments; the number of - // such checks performed is limited above by NoOfMemAccTooLarge. - ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI)); - if (isModOrRefSet(MRI)) - return false; - } } - } - auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI); - Flags->incrementClobberingCalls(); - // If there are no clobbering Defs in the loop, store is safe to hoist. - return MSSA->isLiveOnEntryDef(Source) || - !CurLoop->contains(Source->getBlock()); - } + } + } + auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI); + Flags.incrementClobberingCalls(); + // If there are no clobbering Defs in the loop, store is safe to hoist. 
+ return MSSA->isLiveOnEntryDef(Source) || + !CurLoop->contains(Source->getBlock()); } assert(!I.mayReadOrWriteMemory() && "unhandled aliasing"); @@ -1421,7 +1388,7 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, static Instruction *cloneInstructionInExitBlock( Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI, - const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU) { + const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater &MSSAU) { Instruction *New; if (auto *CI = dyn_cast<CallInst>(&I)) { const auto &BlockColors = SafetyInfo->getBlockColors(); @@ -1457,16 +1424,16 @@ static Instruction *cloneInstructionInExitBlock( if (!I.getName().empty()) New->setName(I.getName() + ".le"); - if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) { + if (MSSAU.getMemorySSA()->getMemoryAccess(&I)) { // Create a new MemoryAccess and let MemorySSA set its defining access. - MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB( + MemoryAccess *NewMemAcc = MSSAU.createMemoryAccessInBB( New, nullptr, New->getParent(), MemorySSA::Beginning); if (NewMemAcc) { if (auto *MemDef = dyn_cast<MemoryDef>(NewMemAcc)) - MSSAU->insertDef(MemDef, /*RenameUses=*/true); + MSSAU.insertDef(MemDef, /*RenameUses=*/true); else { auto *MemUse = cast<MemoryUse>(NewMemAcc); - MSSAU->insertUse(MemUse, /*RenameUses=*/true); + MSSAU.insertUse(MemUse, /*RenameUses=*/true); } } } @@ -1492,25 +1459,22 @@ static Instruction *cloneInstructionInExitBlock( } static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU) { - if (MSSAU) - MSSAU->removeMemoryAccess(&I); + MemorySSAUpdater &MSSAU) { + MSSAU.removeMemoryAccess(&I); SafetyInfo.removeInstruction(&I); I.eraseFromParent(); } static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE) { SafetyInfo.removeInstruction(&I); 
SafetyInfo.insertInstructionTo(&I, Dest.getParent()); I.moveBefore(&Dest); - if (MSSAU) - if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>( - MSSAU->getMemorySSA()->getMemoryAccess(&I))) - MSSAU->moveToPlace(OldMemAcc, Dest.getParent(), - MemorySSA::BeforeTerminator); + if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>( + MSSAU.getMemorySSA()->getMemoryAccess(&I))) + MSSAU.moveToPlace(OldMemAcc, Dest.getParent(), MemorySSA::BeforeTerminator); if (SE) SE->forgetValue(&I); } @@ -1519,7 +1483,7 @@ static Instruction *sinkThroughTriviallyReplaceablePHI( PHINode *TPN, Instruction *I, LoopInfo *LI, SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies, const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater &MSSAU) { assert(isTriviallyReplaceablePHI(*TPN, *I) && "Expect only trivially replaceable PHI"); BasicBlock *ExitBlock = TPN->getParent(); @@ -1625,7 +1589,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, /// static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, const Loop *CurLoop, - ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, + ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater &MSSAU, OptimizationRemarkEmitter *ORE) { bool Changed = false; LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); @@ -1642,7 +1606,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, continue; if (!DT->isReachableFromEntry(User->getParent())) { - U = UndefValue::get(I.getType()); + U = PoisonValue::get(I.getType()); Changed = true; continue; } @@ -1655,7 +1619,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, // unreachable. 
BasicBlock *BB = PN->getIncomingBlock(U); if (!DT->isReachableFromEntry(BB)) { - U = UndefValue::get(I.getType()); + U = PoisonValue::get(I.getType()); Changed = true; continue; } @@ -1669,7 +1633,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, // Split predecessors of the PHI so that we can make users trivially // replaceable. - splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo, MSSAU); + splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo, &MSSAU); // Should rebuild the iterators, as they may be invalidated by // splitPredecessorsOfLoopExit(). @@ -1720,7 +1684,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, Instruction *New = sinkThroughTriviallyReplaceablePHI( PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU); PN->replaceAllUsesWith(New); - eraseInstruction(*PN, *SafetyInfo, nullptr); + eraseInstruction(*PN, *SafetyInfo, MSSAU); Changed = true; } return Changed; @@ -1731,7 +1695,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, /// static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + MemorySSAUpdater &MSSAU, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) { LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getNameOrAsOperand() << ": " << I << "\n"); @@ -1774,14 +1738,12 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, /// Only sink or hoist an instruction if it is not a trapping instruction, /// or if the instruction is known not to trap when moved to the preheader. /// or if it is a trapping instruction and is guaranteed to execute. 
-static bool isSafeToExecuteUnconditionally(Instruction &Inst, - const DominatorTree *DT, - const TargetLibraryInfo *TLI, - const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE, - const Instruction *CtxI) { - if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI)) +static bool isSafeToExecuteUnconditionally( + Instruction &Inst, const DominatorTree *DT, const TargetLibraryInfo *TLI, + const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, const Instruction *CtxI, + bool AllowSpeculation) { + if (AllowSpeculation && isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI)) return true; bool GuaranteedToExecute = @@ -1809,7 +1771,7 @@ class LoopPromoter : public LoadAndStorePromoter { SmallVectorImpl<Instruction *> &LoopInsertPts; SmallVectorImpl<MemoryAccess *> &MSSAInsertPts; PredIteratorCache &PredCache; - MemorySSAUpdater *MSSAU; + MemorySSAUpdater &MSSAU; LoopInfo &LI; DebugLoc DL; Align Alignment; @@ -1841,7 +1803,7 @@ public: SmallVectorImpl<BasicBlock *> &LEB, SmallVectorImpl<Instruction *> &LIP, SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC, - MemorySSAUpdater *MSSAU, LoopInfo &li, DebugLoc dl, + MemorySSAUpdater &MSSAU, LoopInfo &li, DebugLoc dl, Align Alignment, bool UnorderedAtomic, const AAMDNodes &AATags, ICFLoopSafetyInfo &SafetyInfo, bool CanInsertStoresInExitBlocks) : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), @@ -1883,14 +1845,14 @@ public: MemoryAccess *MSSAInsertPoint = MSSAInsertPts[i]; MemoryAccess *NewMemAcc; if (!MSSAInsertPoint) { - NewMemAcc = MSSAU->createMemoryAccessInBB( + NewMemAcc = MSSAU.createMemoryAccessInBB( NewSI, nullptr, NewSI->getParent(), MemorySSA::Beginning); } else { NewMemAcc = - MSSAU->createMemoryAccessAfter(NewSI, nullptr, MSSAInsertPoint); + MSSAU.createMemoryAccessAfter(NewSI, nullptr, MSSAInsertPoint); } MSSAInsertPts[i] = NewMemAcc; - MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true); + 
MSSAU.insertDef(cast<MemoryDef>(NewMemAcc), true); // FIXME: true for safety, false may still be correct. } } @@ -1902,7 +1864,7 @@ public: void instructionDeleted(Instruction *I) const override { SafetyInfo.removeInstruction(I); - MSSAU->removeMemoryAccess(I); + MSSAU.removeMemoryAccess(I); } bool shouldDelete(Instruction *I) const override { @@ -1948,8 +1910,8 @@ bool llvm::promoteLoopAccessesToScalars( SmallVectorImpl<Instruction *> &InsertPts, SmallVectorImpl<MemoryAccess *> &MSSAInsertPts, PredIteratorCache &PIC, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, - Loop *CurLoop, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE) { + Loop *CurLoop, MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, bool AllowSpeculation) { // Verify inputs. assert(LI != nullptr && DT != nullptr && CurLoop != nullptr && SafetyInfo != nullptr && @@ -1997,6 +1959,7 @@ bool llvm::promoteLoopAccessesToScalars( bool DereferenceableInPH = false; bool SafeToInsertStore = false; + bool StoreIsGuanteedToExecute = false; bool FoundLoadToPromote = false; SmallVector<Instruction *, 64> LoopUses; @@ -2031,9 +1994,9 @@ bool llvm::promoteLoopAccessesToScalars( // different sizes. While we are at it, collect alignment and AA info. Type *AccessTy = nullptr; for (Value *ASIV : PointerMustAliases) { - for (User *U : ASIV->users()) { + for (Use &U : ASIV->uses()) { // Ignore instructions that are outside the loop. - Instruction *UI = dyn_cast<Instruction>(U); + Instruction *UI = dyn_cast<Instruction>(U.getUser()); if (!UI || !CurLoop->contains(UI)) continue; @@ -2054,16 +2017,16 @@ bool llvm::promoteLoopAccessesToScalars( // to execute does as well. Thus we can increase our guaranteed // alignment as well. 
if (!DereferenceableInPH || (InstAlignment > Alignment)) - if (isSafeToExecuteUnconditionally(*Load, DT, TLI, CurLoop, - SafetyInfo, ORE, - Preheader->getTerminator())) { + if (isSafeToExecuteUnconditionally( + *Load, DT, TLI, CurLoop, SafetyInfo, ORE, + Preheader->getTerminator(), AllowSpeculation)) { DereferenceableInPH = true; Alignment = std::max(Alignment, InstAlignment); } } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. - if (UI->getOperand(1) != ASIV) + if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) continue; if (!Store->isUnordered()) return false; @@ -2077,10 +2040,12 @@ bool llvm::promoteLoopAccessesToScalars( // alignment than any other guaranteed stores, in which case we can // raise the alignment on the promoted store. Align InstAlignment = Store->getAlign(); - + bool GuaranteedToExecute = + SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop); + StoreIsGuanteedToExecute |= GuaranteedToExecute; if (!DereferenceableInPH || !SafeToInsertStore || (InstAlignment > Alignment)) { - if (SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop)) { + if (GuaranteedToExecute) { DereferenceableInPH = true; SafeToInsertStore = true; Alignment = std::max(Alignment, InstAlignment); @@ -2194,32 +2159,37 @@ bool llvm::promoteLoopAccessesToScalars( // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. 
- LoadInst *PreheaderLoad = new LoadInst( - AccessTy, SomePtr, SomePtr->getName() + ".promoted", - Preheader->getTerminator()); - if (SawUnorderedAtomic) - PreheaderLoad->setOrdering(AtomicOrdering::Unordered); - PreheaderLoad->setAlignment(Alignment); - PreheaderLoad->setDebugLoc(DebugLoc()); - if (AATags) - PreheaderLoad->setAAMetadata(AATags); - SSA.AddAvailableValue(Preheader, PreheaderLoad); + LoadInst *PreheaderLoad = nullptr; + if (FoundLoadToPromote || !StoreIsGuanteedToExecute) { + PreheaderLoad = + new LoadInst(AccessTy, SomePtr, SomePtr->getName() + ".promoted", + Preheader->getTerminator()); + if (SawUnorderedAtomic) + PreheaderLoad->setOrdering(AtomicOrdering::Unordered); + PreheaderLoad->setAlignment(Alignment); + PreheaderLoad->setDebugLoc(DebugLoc()); + if (AATags) + PreheaderLoad->setAAMetadata(AATags); - MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( - PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End); - MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess); - MSSAU->insertUse(NewMemUse, /*RenameUses=*/true); + MemoryAccess *PreheaderLoadMemoryAccess = MSSAU.createMemoryAccessInBB( + PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End); + MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess); + MSSAU.insertUse(NewMemUse, /*RenameUses=*/true); + SSA.AddAvailableValue(Preheader, PreheaderLoad); + } else { + SSA.AddAvailableValue(Preheader, PoisonValue::get(AccessTy)); + } if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + MSSAU.getMemorySSA()->verifyMemorySSA(); // Rewrite all the loads in the loop and remember all the definitions from // stores in the loop. Promoter.run(LoopUses); if (VerifyMemorySSA) - MSSAU->getMemorySSA()->verifyMemorySSA(); + MSSAU.getMemorySSA()->verifyMemorySSA(); // If the SSAUpdater didn't use the load in the preheader, just zap it now. 
- if (PreheaderLoad->use_empty()) + if (PreheaderLoad && PreheaderLoad->use_empty()) eraseInstruction(*PreheaderLoad, *SafetyInfo, MSSAU); return true; @@ -2246,8 +2216,7 @@ collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L) { return false; }; - // Populate AST with potentially promotable accesses and remove them from - // MaybePromotable, so they will not be checked again on the next iteration. + // Populate AST with potentially promotable accesses. SmallPtrSet<Value *, 16> AttemptingPromotion; foreachMemoryAccess(MSSA, L, [&](Instruction *I) { if (IsPotentiallyPromotable(I)) { @@ -2286,15 +2255,9 @@ collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L) { return Result; } -static bool pointerInvalidatedByLoop(MemoryLocation MemLoc, - AliasSetTracker *CurAST, Loop *CurLoop, - AAResults *AA) { - return CurAST->getAliasSetFor(MemLoc).isMod(); -} - -bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU, - Loop *CurLoop, Instruction &I, - SinkAndHoistLICMFlags &Flags) { +static bool pointerInvalidatedByLoop(MemorySSA *MSSA, MemoryUse *MU, + Loop *CurLoop, Instruction &I, + SinkAndHoistLICMFlags &Flags) { // For hoisting, use the walker to determine safety if (!Flags.getIsSink()) { MemoryAccess *Source; @@ -2329,17 +2292,16 @@ bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU, if (Flags.tooManyMemoryAccesses()) return true; for (auto *BB : CurLoop->getBlocks()) - if (pointerInvalidatedByBlockWithMSSA(*BB, *MSSA, *MU)) + if (pointerInvalidatedByBlock(*BB, *MSSA, *MU)) return true; // When sinking, the source block may not be part of the loop so check it. 
if (!CurLoop->contains(&I)) - return pointerInvalidatedByBlockWithMSSA(*I.getParent(), *MSSA, *MU); + return pointerInvalidatedByBlock(*I.getParent(), *MSSA, *MU); return false; } -bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA, - MemoryUse &MU) { +bool pointerInvalidatedByBlock(BasicBlock &BB, MemorySSA &MSSA, MemoryUse &MU) { if (const auto *Accesses = MSSA.getBlockDefs(&BB)) for (const auto &MA : *Accesses) if (const auto *MD = dyn_cast<MemoryDef>(&MA)) diff --git a/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp b/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp index 1c3ff1a61b7e..c063c0d3c88a 100644 --- a/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp +++ b/llvm/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp @@ -8,6 +8,7 @@ #include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" using namespace llvm; #define DEBUG_TYPE "loop-accesses" diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp index d438d56e38ca..2b9800f11912 100644 --- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp +++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp @@ -8,20 +8,15 @@ #include "llvm/Transforms/Scalar/LoopBoundSplit.h" #include "llvm/ADT/Sequence.h" -#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/MemorySSA.h" -#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopSimplify.h" -#include 
"llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #define DEBUG_TYPE "loop-bound-split" @@ -33,26 +28,23 @@ using namespace PatternMatch; namespace { struct ConditionInfo { /// Branch instruction with this condition - BranchInst *BI; + BranchInst *BI = nullptr; /// ICmp instruction with this condition - ICmpInst *ICmp; + ICmpInst *ICmp = nullptr; /// Preciate info - ICmpInst::Predicate Pred; + ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; /// AddRec llvm value - Value *AddRecValue; + Value *AddRecValue = nullptr; /// Non PHI AddRec llvm value Value *NonPHIAddRecValue; /// Bound llvm value - Value *BoundValue; + Value *BoundValue = nullptr; /// AddRec SCEV - const SCEVAddRecExpr *AddRecSCEV; + const SCEVAddRecExpr *AddRecSCEV = nullptr; /// Bound SCEV - const SCEV *BoundSCEV; + const SCEV *BoundSCEV = nullptr; - ConditionInfo() - : BI(nullptr), ICmp(nullptr), Pred(ICmpInst::BAD_ICMP_PREDICATE), - AddRecValue(nullptr), BoundValue(nullptr), AddRecSCEV(nullptr), - BoundSCEV(nullptr) {} + ConditionInfo() = default; }; } // namespace diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 57e36e5b9b90..9590fbbb1994 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -22,7 +22,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" @@ -30,9 +29,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" -#include "llvm/Transforms/Utils/ValueMapper.h" #define DEBUG_TYPE "loop-data-prefetch" @@ -236,15 +233,14 @@ struct Prefetch { 
/// The address formula for this prefetch as returned by ScalarEvolution. const SCEVAddRecExpr *LSCEVAddRec; /// The point of insertion for the prefetch instruction. - Instruction *InsertPt; + Instruction *InsertPt = nullptr; /// True if targeting a write memory access. - bool Writes; + bool Writes = false; /// The (first seen) prefetched instruction. - Instruction *MemI; + Instruction *MemI = nullptr; /// Constructor to create a new Prefetch for \p I. - Prefetch(const SCEVAddRecExpr *L, Instruction *I) - : LSCEVAddRec(L), InsertPt(nullptr), Writes(false), MemI(nullptr) { + Prefetch(const SCEVAddRecExpr *L, Instruction *I) : LSCEVAddRec(L) { addInstruction(I); }; @@ -303,7 +299,11 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { } Metrics.analyzeBasicBlock(BB, *TTI, EphValues); } - unsigned LoopSize = Metrics.NumInsts; + + if (!Metrics.NumInsts.isValid()) + return MadeChange; + + unsigned LoopSize = *Metrics.NumInsts.getValue(); if (!LoopSize) LoopSize = 1; diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 361d6c0d9381..93f3cd704196 100644 --- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -17,12 +17,12 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PatternMatch.h" @@ -192,13 +192,13 @@ getValueOnFirstIteration(Value *V, DenseMap<Value *, Value *> &FirstIterValue, getValueOnFirstIteration(BO->getOperand(0), FirstIterValue, SQ); Value *RHS = getValueOnFirstIteration(BO->getOperand(1), FirstIterValue, SQ); - FirstIterV = SimplifyBinOp(BO->getOpcode(), LHS, RHS, SQ); 
+ FirstIterV = simplifyBinOp(BO->getOpcode(), LHS, RHS, SQ); } else if (auto *Cmp = dyn_cast<ICmpInst>(V)) { Value *LHS = getValueOnFirstIteration(Cmp->getOperand(0), FirstIterValue, SQ); Value *RHS = getValueOnFirstIteration(Cmp->getOperand(1), FirstIterValue, SQ); - FirstIterV = SimplifyICmpInst(Cmp->getPredicate(), LHS, RHS, SQ); + FirstIterV = simplifyICmpInst(Cmp->getPredicate(), LHS, RHS, SQ); } else if (auto *Select = dyn_cast<SelectInst>(V)) { Value *Cond = getValueOnFirstIteration(Select->getCondition(), FirstIterValue, SQ); @@ -458,13 +458,13 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, if (ExitBlock && isLoopNeverExecuted(L)) { LLVM_DEBUG(dbgs() << "Loop is proven to never execute, delete it!"); // We need to forget the loop before setting the incoming values of the exit - // phis to undef, so we properly invalidate the SCEV expressions for those + // phis to poison, so we properly invalidate the SCEV expressions for those // phis. SE.forgetLoop(L); - // Set incoming value to undef for phi nodes in the exit block. + // Set incoming value to poison for phi nodes in the exit block. for (PHINode &P : ExitBlock->phis()) { std::fill(P.incoming_values().begin(), P.incoming_values().end(), - UndefValue::get(P.getType())); + PoisonValue::get(P.getType())); } ORE.emit([&]() { return OptimizationRemark(DEBUG_TYPE, "NeverExecutes", L->getStartLoc(), diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 0f4c767c1e4c..03a10cb36bb6 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -47,7 +47,6 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" @@ -231,7 +230,7 @@ public: // having to update as many def-use and use-def chains. 
for (auto *Inst : reverse(Unused)) { if (!Inst->use_empty()) - Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType())); Inst->eraseFromParent(); } } @@ -601,7 +600,7 @@ private: {LLVMLoopDistributeFollowupAll, Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential : LLVMLoopDistributeFollowupCoincident}); - if (PartitionID.hasValue()) { + if (PartitionID) { Loop *NewLoop = Part->getDistributedLoop(); NewLoop->setLoopID(PartitionID.getValue()); } @@ -770,19 +769,19 @@ public: // Don't distribute the loop if we need too many SCEV run-time checks, or // any if it's illegal. - const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate(); + const SCEVPredicate &Pred = LAI->getPSE().getPredicate(); if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) { return fail("RuntimeCheckWithConvergent", "may not insert runtime check with convergent operation"); } - if (Pred.getComplexity() > (IsForced.getValueOr(false) + if (Pred.getComplexity() > (IsForced.value_or(false) ? PragmaDistributeSCEVCheckThreshold : DistributeSCEVCheckThreshold)) return fail("TooManySCEVRuntimeChecks", "too many SCEV run-time checks needed.\n"); - if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L)) + if (!IsForced.value_or(false) && hasDisableAllTransformsHint(L)) return fail("HeuristicDisabled", "distribution heuristic disabled"); LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); @@ -859,7 +858,7 @@ public: /// Provide diagnostics then \return with false. bool fail(StringRef RemarkName, StringRef Message) { LLVMContext &Ctx = F->getContext(); - bool Forced = isForced().getValueOr(false); + bool Forced = isForced().value_or(false); LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n"); @@ -991,7 +990,7 @@ static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, // If distribution was forced for the specific loop to be // enabled/disabled, follow that. Otherwise use the global flag. 
- if (LDL.isForced().getValueOr(EnableLoopDistribute)) + if (LDL.isForced().value_or(EnableLoopDistribute)) Changed |= LDL.processLoop(GetLAA); } diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index c46db4e63bfe..f36193fc468e 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -54,6 +54,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -64,12 +65,12 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" @@ -210,8 +211,9 @@ struct FlattenInfo { if (!MatchedItCount) return false; - // Look through extends if the IV has been widened. - if (Widened && + // Look through extends if the IV has been widened. Don't look through + // extends if we already looked through a trunc. + if (Widened && IsAdd && (isa<SExtInst>(MatchedItCount) || isa<ZExtInst>(MatchedItCount))) { assert(MatchedItCount->getType() == InnerInductionPHI->getType() && "Unexpected type mismatch in types after widening"); @@ -410,7 +412,7 @@ static bool findLoopComponents( // pre-header and one from the latch. The incoming latch value is the // increment variable. 
Increment = - dyn_cast<BinaryOperator>(InductionPHI->getIncomingValueForBlock(Latch)); + cast<BinaryOperator>(InductionPHI->getIncomingValueForBlock(Latch)); if (Increment->hasNUsesOrMore(3)) { LLVM_DEBUG(dbgs() << "Could not find valid increment\n"); return false; @@ -921,7 +923,7 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM, // this pass will simplify all loops that contain inner loops, // regardless of whether anything ends up being flattened. Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr); + MSSAU ? MSSAU.getPointer() : nullptr); if (!Changed) return PreservedAnalyses::all(); @@ -987,7 +989,7 @@ bool LoopFlattenLegacyPass::runOnFunction(Function &F) { for (Loop *L : *LI) { auto LN = LoopNest::getLoopNest(*L, *SE); Changed |= Flatten(*LN, DT, LI, SE, AC, TTI, nullptr, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr); + MSSAU ? MSSAU.getPointer() : nullptr); } return Changed; } diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index bf4d275e04ba..d94b767c7b63 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -117,7 +117,7 @@ static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( "Use the dependence analysis interface"), clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", "Use all available analyses")), - cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore); + cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL)); static cl::opt<unsigned> FusionPeelMaxCount( "loop-fusion-peel-max-count", cl::init(0), cl::Hidden, @@ -128,7 +128,7 @@ static cl::opt<unsigned> FusionPeelMaxCount( static cl::opt<bool> VerboseFusionDebugging("loop-fusion-verbose-debug", cl::desc("Enable verbose debugging for Loop Fusion"), - cl::Hidden, cl::init(false), cl::ZeroOrMore); + cl::Hidden, cl::init(false)); #endif namespace { @@ -178,12 +178,12 @@ struct 
FusionCandidate { /// FusionCandidateCompare function, required by FusionCandidateSet to /// determine where the FusionCandidate should be inserted into the set. These /// are used to establish ordering of the FusionCandidates based on dominance. - const DominatorTree *DT; + DominatorTree &DT; const PostDominatorTree *PDT; OptimizationRemarkEmitter &ORE; - FusionCandidate(Loop *L, const DominatorTree *DT, + FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP) : Preheader(L->getLoopPreheader()), Header(L->getHeader()), @@ -192,7 +192,6 @@ struct FusionCandidate { GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { - assert(DT && "Expected non-null DT!"); // Walk over all blocks in the loop and check for conditions that may // prevent fusion. For each block, walk over all instructions and collect // the memory reads and writes If any instructions that prevent fusion are @@ -391,7 +390,7 @@ struct FusionCandidateCompare { /// IF RHS dominates LHS and LHS post-dominates RHS, return false; bool operator()(const FusionCandidate &LHS, const FusionCandidate &RHS) const { - const DominatorTree *DT = LHS.DT; + const DominatorTree *DT = &(LHS.DT); BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); @@ -646,7 +645,7 @@ private: for (Loop *L : LV) { TTI::PeelingPreferences PP = gatherPeelingPreferences(L, SE, TTI, None, None); - FusionCandidate CurrCand(L, &DT, &PDT, ORE, PP); + FusionCandidate CurrCand(L, DT, &PDT, ORE, PP); if (!CurrCand.isEligibleForFusion(SE)) continue; @@ -991,7 +990,7 @@ private: FuseCounter); FusionCandidate FusedCand( - performFusion((Peel ? FC0Copy : *FC0), *FC1), &DT, &PDT, ORE, + performFusion((Peel ? 
FC0Copy : *FC0), *FC1), DT, &PDT, ORE, FC0Copy.PP); FusedCand.verify(); assert(FusedCand.isEligibleForFusion(SE) && diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 318c4c06f0f7..88d6a7aff3c9 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -61,7 +61,6 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -346,7 +345,7 @@ INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom", Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); } static void deleteDeadInstruction(Instruction *I) { - I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->replaceAllUsesWith(PoisonValue::get(I->getType())); I->eraseFromParent(); } @@ -798,7 +797,7 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, } /// processLoopMemIntrinsic - Template function for calling different processor -/// functions based on mem instrinsic type. +/// functions based on mem intrinsic type. 
template <typename MemInst> bool LoopIdiomRecognize::processLoopMemIntrinsic( BasicBlock *BB, @@ -995,9 +994,8 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, SmallPtrSet<Instruction *, 1> MSIs; MSIs.insert(MSI); return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()), - MaybeAlign(MSI->getDestAlignment()), - SplatValue, MSI, MSIs, Ev, BECount, - IsNegStride, /*IsLoopMemset=*/true); + MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev, + BECount, IsNegStride, /*IsLoopMemset=*/true); } /// mayLoopAccessLocation - Return true if the specified loop might access the @@ -1101,6 +1099,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) { + Module *M = TheStore->getModule(); Value *SplatValue = isBytewiseValue(StoredVal, *DL); Constant *PatternValue = nullptr; @@ -1173,6 +1172,8 @@ bool LoopIdiomRecognize::processLoopStridedStore( CallInst *NewCall; if (SplatValue) { AAMDNodes AATags = TheStore->getAAMetadata(); + for (Instruction *Store : Stores) + AATags = AATags.merge(Store->getAAMetadata()); if (auto CI = dyn_cast<ConstantInt>(NumBytes)) AATags = AATags.extendTo(CI->getZExtValue()); else @@ -1181,15 +1182,14 @@ bool LoopIdiomRecognize::processLoopStridedStore( NewCall = Builder.CreateMemSet( BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment), /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias); - } else { + } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; - Module *M = TheStore->getModule(); StringRef FuncName = "memset_pattern16"; - FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(), - Int8PtrTy, Int8PtrTy, IntIdxTy); - inferLibFuncAttributes(M, FuncName, *TLI); + FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, 
LibFunc_memset_pattern16, + Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy); + inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. @@ -1200,7 +1200,9 @@ bool LoopIdiomRecognize::processLoopStridedStore( GV->setAlignment(Align(16)); Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); - } + } else + return Changed; + NewCall->setDebugLoc(TheStore->getDebugLoc()); if (MSSAU) { @@ -1275,9 +1277,8 @@ class MemmoveVerifier { public: explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr, const DataLayout &DL) - : DL(DL), LoadOff(0), StoreOff(0), - BP1(llvm::GetPointerBaseWithConstantOffset( - LoadBasePtr.stripPointerCasts(), LoadOff, DL)), + : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset( + LoadBasePtr.stripPointerCasts(), LoadOff, DL)), BP2(llvm::GetPointerBaseWithConstantOffset( StoreBasePtr.stripPointerCasts(), StoreOff, DL)), IsSameObject(BP1 == BP2) {} @@ -1307,8 +1308,8 @@ public: private: const DataLayout &DL; - int64_t LoadOff; - int64_t StoreOff; + int64_t LoadOff = 0; + int64_t StoreOff = 0; const Value *BP1; const Value *BP2; @@ -1420,26 +1421,19 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( // If the store is a memcpy instruction, we must check if it will write to // the load memory locations. So remove it from the ignored stores. 
- if (IsMemCpy) - IgnoredInsts.erase(TheStore); MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL); + if (IsMemCpy && !Verifier.IsSameObject) + IgnoredInsts.erase(TheStore); if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount, StoreSizeSCEV, *AA, IgnoredInsts)) { - if (!IsMemCpy) { - ORE.emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", - TheLoad) - << ore::NV("Inst", InstRemark) << " in " - << ore::NV("Function", TheStore->getFunction()) - << " function will not be hoisted: " - << ore::NV("Reason", "The loop may access load location"); - }); - return Changed; - } - // At this point loop may access load only for memcpy in same underlying - // object. If that's not the case bail out. - if (!Verifier.IsSameObject) - return Changed; + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad) + << ore::NV("Inst", InstRemark) << " in " + << ore::NV("Function", TheStore->getFunction()) + << " function will not be hoisted: " + << ore::NV("Reason", "The loop may access load location"); + }); + return Changed; } bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore; @@ -1487,7 +1481,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( return Changed; // We cannot allow unaligned ops for unordered load/store, so reject // anything where the alignment isn't at least the element size. 
- assert((StoreAlign.hasValue() && LoadAlign.hasValue()) && + assert((StoreAlign && LoadAlign) && "Expect unordered load/store to have align."); if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize) return Changed; diff --git a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index b9e63a4bc06f..4249512ea0f8 100644 --- a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopInstSimplify.h" -#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -25,21 +24,17 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/User.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" -#include <algorithm> #include <utility> using namespace llvm; @@ -101,7 +96,7 @@ static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI, if (!IsFirstIteration && !ToSimplify->count(&I)) continue; - Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I)); + Value *V = simplifyInstruction(&I, SQ.getWithInstruction(&I)); if (!V || !LI.replacementPreservesLCSSAForm(&I, V)) continue; @@ -109,6 +104,10 @@ static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI, auto *UserI = cast<Instruction>(U.getUser()); U.set(V); + // Do not bother dealing with unreachable code. 
+ if (!DT.isReachableFromEntry(UserI->getParent())) + continue; + // If the instruction is used by a PHI node we have already processed // we'll need to iterate on the loop body to converge, so add it to // the next set. @@ -222,7 +221,7 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, AR.MSSA->verifyMemorySSA(); } if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr)) + MSSAU ? MSSAU.getPointer() : nullptr)) return PreservedAnalyses::all(); auto PA = getLoopPassPreservedAnalyses(); diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index c2b065c4eb31..1d3023d04463 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/LoopPass.h" @@ -33,7 +34,6 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" @@ -44,7 +44,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <cassert> @@ -120,8 +119,6 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, std::vector<char> Dep; Instruction *Src = cast<Instruction>(*I); Instruction *Dst = cast<Instruction>(*J); - if (Src == Dst) - continue; // Ignore Input dependencies. 
if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) continue; @@ -270,26 +267,28 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, return true; } -static LoopVector populateWorklist(Loop &L) { +static void populateWorklist(Loop &L, LoopVector &LoopList) { LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: " << L.getHeader()->getParent()->getName() << " Loop: %" << L.getHeader()->getName() << '\n'); - LoopVector LoopList; + assert(LoopList.empty() && "LoopList should initially be empty!"); Loop *CurrentLoop = &L; const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); while (!Vec->empty()) { // The current loop has multiple subloops in it hence it is not tightly // nested. // Discard all loops above it added into Worklist. - if (Vec->size() != 1) - return {}; + if (Vec->size() != 1) { + LoopList = {}; + return; + } LoopList.push_back(CurrentLoop); CurrentLoop = Vec->front(); Vec = &CurrentLoop->getSubLoops(); } LoopList.push_back(CurrentLoop); - return LoopList; + return; } namespace { @@ -360,8 +359,10 @@ public: : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} /// Check if the loop interchange is profitable. - bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, - CharMatrix &DepMatrix); + bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, + unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix, + const DenseMap<const Loop *, unsigned> &CostMap); private: int getInstrOrderCost(); @@ -412,23 +413,26 @@ struct LoopInterchange { LoopInfo *LI = nullptr; DependenceInfo *DI = nullptr; DominatorTree *DT = nullptr; + std::unique_ptr<CacheCost> CC = nullptr; /// Interface to emit optimization remarks. 
OptimizationRemarkEmitter *ORE; LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, - DominatorTree *DT, OptimizationRemarkEmitter *ORE) - : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {} + DominatorTree *DT, std::unique_ptr<CacheCost> &CC, + OptimizationRemarkEmitter *ORE) + : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {} bool run(Loop *L) { if (L->getParentLoop()) return false; - - return processLoopList(populateWorklist(*L)); + SmallVector<Loop *, 8> LoopList; + populateWorklist(*L, LoopList); + return processLoopList(LoopList); } bool run(LoopNest &LN) { - const auto &LoopList = LN.getLoops(); + SmallVector<Loop *, 8> LoopList(LN.getLoops().begin(), LN.getLoops().end()); for (unsigned I = 1; I < LoopList.size(); ++I) if (LoopList[I]->getParentLoop() != LoopList[I - 1]) return false; @@ -460,7 +464,7 @@ struct LoopInterchange { return LoopList.size() - 1; } - bool processLoopList(ArrayRef<Loop *> LoopList) { + bool processLoopList(SmallVectorImpl<Loop *> &LoopList) { bool Changed = false; unsigned LoopNestDepth = LoopList.size(); if (LoopNestDepth < 2) { @@ -500,27 +504,55 @@ struct LoopInterchange { } unsigned SelecLoopId = selectLoopForInterchange(LoopList); - // Move the selected loop outwards to the best possible position. - Loop *LoopToBeInterchanged = LoopList[SelecLoopId]; - for (unsigned i = SelecLoopId; i > 0; i--) { - bool Interchanged = processLoop(LoopToBeInterchanged, LoopList[i - 1], i, - i - 1, DependencyMatrix); - if (!Interchanged) - return Changed; - // Update the DependencyMatrix - interChangeDependencies(DependencyMatrix, i, i - 1); + // Obtain the loop vector returned from loop cache analysis beforehand, + // and put each <Loop, index> pair into a map for constant time query + // later. 
Indices in loop vector reprsent the optimal order of the + // corresponding loop, e.g., given a loopnest with depth N, index 0 + // indicates the loop should be placed as the outermost loop and index N + // indicates the loop should be placed as the innermost loop. + // + // For the old pass manager CacheCost would be null. + DenseMap<const Loop *, unsigned> CostMap; + if (CC != nullptr) { + const auto &LoopCosts = CC->getLoopCosts(); + for (unsigned i = 0; i < LoopCosts.size(); i++) { + CostMap[LoopCosts[i].first] = i; + } + } + // We try to achieve the globally optimal memory access for the loopnest, + // and do interchange based on a bubble-sort fasion. We start from + // the innermost loop, move it outwards to the best possible position + // and repeat this process. + for (unsigned j = SelecLoopId; j > 0; j--) { + bool ChangedPerIter = false; + for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) { + bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1, + DependencyMatrix, CostMap); + if (!Interchanged) + continue; + // Loops interchanged, update LoopList accordingly. + std::swap(LoopList[i - 1], LoopList[i]); + // Update the DependencyMatrix + interChangeDependencies(DependencyMatrix, i, i - 1); #ifdef DUMP_DEP_MATRICIES - LLVM_DEBUG(dbgs() << "Dependence after interchange\n"); - printDepMatrix(DependencyMatrix); + LLVM_DEBUG(dbgs() << "Dependence after interchange\n"); + printDepMatrix(DependencyMatrix); #endif - Changed |= Interchanged; + ChangedPerIter |= Interchanged; + Changed |= Interchanged; + } + // Early abort if there was no interchange during an entire round of + // moving loops outwards. 
+ if (!ChangedPerIter) + break; } return Changed; } bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, - std::vector<std::vector<char>> &DependencyMatrix) { + std::vector<std::vector<char>> &DependencyMatrix, + const DenseMap<const Loop *, unsigned> &CostMap) { LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << "\n"); LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); @@ -530,7 +562,8 @@ struct LoopInterchange { } LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); - if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { + if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, + DependencyMatrix, CostMap)) { LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); return false; } @@ -733,8 +766,12 @@ static PHINode *findInnerReductionPhi(Loop *L, Value *V) { if (PHI->getNumIncomingValues() == 1) continue; RecurrenceDescriptor RD; - if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) + if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) { + // Detect floating point reduction only when it can be reordered. + if (RD.getExactFPMathInst() != nullptr) + return nullptr; return PHI; + } return nullptr; } } @@ -893,28 +930,23 @@ areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); for (PHINode &PHI : LoopNestExit->phis()) { - // FIXME: We currently are not able to detect floating point reductions - // and have to use floating point PHIs as a proxy to prevent - // interchanging in the presence of floating point reductions. 
- if (PHI.getType()->isFloatingPointTy()) - return false; for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { - Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i)); - if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) - continue; + Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i)); + if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) + continue; - // The incoming value is defined in the outer loop latch. Currently we - // only support that in case the outer loop latch has a single predecessor. - // This guarantees that the outer loop latch is executed if and only if - // the inner loop is executed (because tightlyNested() guarantees that the - // outer loop header only branches to the inner loop or the outer loop - // latch). - // FIXME: We could weaken this logic and allow multiple predecessors, - // if the values are produced outside the loop latch. We would need - // additional logic to update the PHI nodes in the exit block as - // well. - if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) - return false; + // The incoming value is defined in the outer loop latch. Currently we + // only support that in case the outer loop latch has a single predecessor. + // This guarantees that the outer loop latch is executed if and only if + // the inner loop is executed (because tightlyNested() guarantees that the + // outer loop header only branches to the inner loop or the outer loop + // latch). + // FIXME: We could weaken this logic and allow multiple predecessors, + // if the values are produced outside the loop latch. We would need + // additional logic to update the PHI nodes in the exit block as + // well. 
+ if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) + return false; } } return true; @@ -1125,21 +1157,33 @@ static bool isProfitableForVectorization(unsigned InnerLoopId, return !DepMatrix.empty(); } -bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, - unsigned OuterLoopId, - CharMatrix &DepMatrix) { - // TODO: Add better profitability checks. - // e.g - // 1) Construct dependency matrix and move the one with no loop carried dep - // inside to enable vectorization. +bool LoopInterchangeProfitability::isProfitable( + const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, + unsigned OuterLoopId, CharMatrix &DepMatrix, + const DenseMap<const Loop *, unsigned> &CostMap) { + // TODO: Remove the legacy cost model. - // This is rough cost estimation algorithm. It counts the good and bad order - // of induction variables in the instruction and allows reordering if number - // of bad orders is more than good. - int Cost = getInstrOrderCost(); - LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); - if (Cost < -LoopInterchangeCostThreshold) - return true; + // This is the new cost model returned from loop cache analysis. + // A smaller index means the loop should be placed an outer loop, and vice + // versa. + if (CostMap.find(InnerLoop) != CostMap.end() && + CostMap.find(OuterLoop) != CostMap.end()) { + unsigned InnerIndex = 0, OuterIndex = 0; + InnerIndex = CostMap.find(InnerLoop)->second; + OuterIndex = CostMap.find(OuterLoop)->second; + LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex + << ", OuterIndex = " << OuterIndex << "\n"); + if (InnerIndex < OuterIndex) + return true; + } else { + // Legacy cost model: this is rough cost estimation algorithm. It counts the + // good and bad order of induction variables in the instruction and allows + // reordering if number of bad orders is more than good. 
+ int Cost = getInstrOrderCost(); + LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); + if (Cost < -LoopInterchangeCostThreshold) + return true; + } // It is not profitable as per current cache profitability model. But check if // we can move this loop outside to improve parallelism. @@ -1150,10 +1194,8 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", InnerLoop->getStartLoc(), InnerLoop->getHeader()) - << "Interchanging loops is too costly (cost=" - << ore::NV("Cost", Cost) << ", threshold=" - << ore::NV("Threshold", LoopInterchangeCostThreshold) - << ") and it does not improve parallelism."; + << "Interchanging loops is too costly and it does not improve " + "parallelism."; }); return false; } @@ -1424,9 +1466,13 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, // Incoming values are guaranteed be instructions currently. auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch)); + // In case of multi-level nested loops, follow LCSSA to find the incoming + // value defined from the innermost loop. + auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI)); // Skip phis with incoming values from the inner loop body, excluding the // header and latch. 
- if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader) + if (IncIInnerMost->getParent() != InnerLatch && + IncIInnerMost->getParent() != InnerHeader) continue; assert(all_of(P.users(), @@ -1695,8 +1741,8 @@ struct LoopInterchangeLegacyPass : public LoopPass { auto *DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI(); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - - return LoopInterchange(SE, LI, DI, DT, ORE).run(L); + std::unique_ptr<CacheCost> CC = nullptr; + return LoopInterchange(SE, LI, DI, DT, CC, ORE).run(L); } }; } // namespace @@ -1723,8 +1769,10 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, Function &F = *LN.getParent(); DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); + std::unique_ptr<CacheCost> CC = + CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI); OptimizationRemarkEmitter ORE(&F); - if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &ORE).run(LN)) + if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); } diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index 21d59936616b..1877ac1dfd08 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -61,7 +61,6 @@ #include <algorithm> #include <cassert> #include <forward_list> -#include <set> #include <tuple> #include <utility> @@ -213,7 +212,8 @@ public: continue; // Only progagate the value if they are of the same type. 
- if (Store->getPointerOperandType() != Load->getPointerOperandType()) + if (Store->getPointerOperandType() != Load->getPointerOperandType() || + getLoadStoreType(Store) != getLoadStoreType(Load)) continue; Candidates.emplace_front(Load, Store); @@ -528,7 +528,7 @@ public: return false; } - if (LAI.getPSE().getUnionPredicate().getComplexity() > + if (LAI.getPSE().getPredicate().getComplexity() > LoadElimSCEVCheckThreshold) { LLVM_DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); return false; @@ -539,7 +539,7 @@ public: return false; } - if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) { + if (!Checks.empty() || !LAI.getPSE().getPredicate().isAlwaysTrue()) { if (LAI.hasConvergentOp()) { LLVM_DEBUG(dbgs() << "Versioning is needed but not allowed with " "convergent calls\n"); @@ -706,8 +706,12 @@ FunctionPass *llvm::createLoopLoadEliminationPass() { PreservedAnalyses LoopLoadEliminationPass::run(Function &F, FunctionAnalysisManager &AM) { - auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &LI = AM.getResult<LoopAnalysis>(F); + // There are no loops in the function. Return before computing other expensive + // analyses. 
+ if (LI.empty()) + return PreservedAnalyses::all(); + auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &TTI = AM.getResult<TargetIRAnalysis>(F); auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index 6c783848432b..d20d275ea60c 100644 --- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -8,14 +8,12 @@ #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemorySSA.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Support/Debug.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/TimeProfiler.h" using namespace llvm; @@ -311,12 +309,12 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, #ifndef NDEBUG // LoopAnalysisResults should always be valid. - // Note that we don't LAR.SE.verify() because that can change observed SE - // queries. See PR44815. 
if (VerifyDomInfo) LAR.DT.verify(); if (VerifyLoopInfo) LAR.LI.verify(LAR.DT); + if (VerifySCEV) + LAR.SE.verify(); if (LAR.MSSA && VerifyMemorySSA) LAR.MSSA->verifyMemorySSA(); #endif diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp index aa7e79a589f2..d0ee5b47a8ca 100644 --- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -188,7 +188,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" @@ -244,7 +243,7 @@ struct LoopICmp { LoopICmp(ICmpInst::Predicate Pred, const SCEVAddRecExpr *IV, const SCEV *Limit) : Pred(Pred), IV(IV), Limit(Limit) {} - LoopICmp() {} + LoopICmp() = default; void dump() { dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV << ", Limit = " << *Limit << "\n"; @@ -778,7 +777,7 @@ unsigned LoopPredication::collectChecks(SmallVectorImpl<Value *> &Checks, if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) { if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander, Guard)) { - Checks.push_back(NewRangeCheck.getValue()); + Checks.push_back(*NewRangeCheck); NumWidened++; continue; } diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 9d22eceb987f..f4ef22562341 100644 --- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -29,15 +29,11 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include 
"llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" @@ -59,7 +55,6 @@ #include <cassert> #include <cstddef> #include <cstdint> -#include <cstdlib> #include <iterator> #include <map> #include <utility> @@ -559,12 +554,12 @@ bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) { } // Must be a CMP or an ext (of a value with nsw) then CMP else { - Instruction *UUser = dyn_cast<Instruction>(UU); + auto *UUser = cast<Instruction>(UU); // Skip SExt if we are extending an nsw value // TODO: Allow ZExt too - if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() && + if (BO->hasNoSignedWrap() && UUser->hasOneUse() && isa<SExtInst>(UUser)) - UUser = dyn_cast<Instruction>(*(UUser->user_begin())); + UUser = cast<Instruction>(*(UUser->user_begin())); if (!isCompareUsedByBranch(UUser)) return false; } diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp index 5ba137b1c85f..d9c33b5f335a 100644 --- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -11,10 +11,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopRotation.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyBlockFrequencyInfo.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" @@ -22,9 +22,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopRotationUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -62,8 +60,8 @@ PreservedAnalyses 
LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, MSSAU = MemorySSAUpdater(AR.MSSA); bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ, false, - Threshold, false, PrepareForLTO || PrepareForLTOOption); + MSSAU ? MSSAU.getPointer() : nullptr, SQ, false, Threshold, + false, PrepareForLTO || PrepareForLTOOption); if (!Changed) return PreservedAnalyses::all(); @@ -133,9 +131,8 @@ public: : MaxHeaderSize; return LoopRotation(L, LI, TTI, AC, &DT, &SE, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ, - false, Threshold, false, - PrepareForLTO || PrepareForLTOOption); + MSSAU ? MSSAU.getPointer() : nullptr, SQ, false, + Threshold, false, PrepareForLTO || PrepareForLTOOption); } }; } // end namespace diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index d3fcba10c275..b7e0e32780b4 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -16,28 +16,21 @@ #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/DomTreeUpdater.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" -#include 
"llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -261,13 +254,17 @@ private: assert(L.getNumBlocks() == LiveLoopBlocks.size() + DeadLoopBlocks.size() && "Malformed block sets?"); - // Now, all exit blocks that are not marked as live are dead. + // Now, all exit blocks that are not marked as live are dead, if all their + // predecessors are in the loop. This may not be the case, as the input loop + // may not by in loop-simplify/canonical form. SmallVector<BasicBlock *, 8> ExitBlocks; L.getExitBlocks(ExitBlocks); SmallPtrSet<BasicBlock *, 8> UniqueDeadExits; for (auto *ExitBlock : ExitBlocks) if (!LiveExitBlocks.count(ExitBlock) && - UniqueDeadExits.insert(ExitBlock).second) + UniqueDeadExits.insert(ExitBlock).second && + all_of(predecessors(ExitBlock), + [this](BasicBlock *Pred) { return L.contains(Pred); })) DeadExitBlocks.push_back(ExitBlock); // Whether or not the edge From->To will still be present in graph after the @@ -374,7 +371,7 @@ private: DeadInstructions.emplace_back(LandingPad); for (Instruction *I : DeadInstructions) { - I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->replaceAllUsesWith(PoisonValue::get(I->getType())); I->eraseFromParent(); } @@ -704,8 +701,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, MSSAU = MemorySSAUpdater(AR.MSSA); bool DeleteCurrentLoop = false; if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, - DeleteCurrentLoop)) + MSSAU ? MSSAU.getPointer() : nullptr, DeleteCurrentLoop)) return PreservedAnalyses::all(); if (DeleteCurrentLoop) @@ -739,9 +735,9 @@ public: if (MSSAA && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); bool DeleteCurrentLoop = false; - bool Changed = simplifyLoopCFG( - *L, DT, LI, SE, MSSAU.hasValue() ? 
MSSAU.getPointer() : nullptr, - DeleteCurrentLoop); + bool Changed = + simplifyLoopCFG(*L, DT, LI, SE, MSSAU ? MSSAU.getPointer() : nullptr, + DeleteCurrentLoop); if (DeleteCurrentLoop) LPM.markLoopAsDeleted(*L); return Changed; diff --git a/llvm/lib/Transforms/Scalar/LoopSink.cpp b/llvm/lib/Transforms/Scalar/LoopSink.cpp index c9c9e60d0921..dce1af475fb1 100644 --- a/llvm/lib/Transforms/Scalar/LoopSink.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSink.cpp @@ -34,24 +34,18 @@ #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -70,14 +64,6 @@ static cl::opt<unsigned> MaxNumberOfUseBBsForSinking( "max-uses-for-sinking", cl::Hidden, cl::init(30), cl::desc("Do not sink instructions that have too many uses.")); -static cl::opt<bool> EnableMSSAInLoopSink( - "enable-mssa-in-loop-sink", cl::Hidden, cl::init(true), - cl::desc("Enable MemorySSA for LoopSink in new pass manager")); - -static cl::opt<bool> EnableMSSAInLegacyLoopSink( - "enable-mssa-in-legacy-loop-sink", cl::Hidden, cl::init(false), - cl::desc("Enable MemorySSA for LoopSink in legacy pass manager")); - 
/// Return adjusted total frequency of \p BBs. /// /// * If there is only one BB, sinking instruction will not introduce code @@ -279,9 +265,8 @@ static bool sinkInstruction( static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, DominatorTree &DT, BlockFrequencyInfo &BFI, - ScalarEvolution *SE, - AliasSetTracker *CurAST, - MemorySSA *MSSA) { + MemorySSA &MSSA, + ScalarEvolution *SE) { BasicBlock *Preheader = L.getLoopPreheader(); assert(Preheader && "Expected loop to have preheader"); @@ -297,13 +282,8 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, })) return false; - std::unique_ptr<MemorySSAUpdater> MSSAU; - std::unique_ptr<SinkAndHoistLICMFlags> LICMFlags; - if (MSSA) { - MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); - LICMFlags = - std::make_unique<SinkAndHoistLICMFlags>(/*IsSink=*/true, &L, MSSA); - } + MemorySSAUpdater MSSAU(&MSSA); + SinkAndHoistLICMFlags LICMFlags(/*IsSink=*/true, &L, &MSSA); bool Changed = false; @@ -324,14 +304,15 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, // on B (A appears after B), A needs to be sinked first before B can be // sinked. for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { + if (isa<PHINode>(&I)) + continue; // No need to check for instruction's operands are loop invariant. 
assert(L.hasLoopInvariantOperands(&I) && "Insts in a loop's preheader should have loop invariant operands!"); - if (!canSinkOrHoistInst(I, &AA, &DT, &L, CurAST, MSSAU.get(), false, - LICMFlags.get())) + if (!canSinkOrHoistInst(I, &AA, &DT, &L, MSSAU, false, LICMFlags)) continue; if (sinkInstruction(L, I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI, - MSSAU.get())) + &MSSAU)) Changed = true; } @@ -340,13 +321,6 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, return Changed; } -static void computeAliasSet(Loop &L, BasicBlock &Preheader, - AliasSetTracker &CurAST) { - for (BasicBlock *BB : L.blocks()) - CurAST.add(*BB); - CurAST.add(Preheader); -} - PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); // Nothing to do if there are no loops. @@ -356,10 +330,7 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { AAResults &AA = FAM.getResult<AAManager>(F); DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); - - MemorySSA *MSSA = EnableMSSAInLoopSink - ? &FAM.getResult<MemorySSAAnalysis>(F).getMSSA() - : nullptr; + MemorySSA &MSSA = FAM.getResult<MemorySSAAnalysis>(F).getMSSA(); // We want to do a postorder walk over the loops. Since loops are a tree this // is equivalent to a reversed preorder walk and preorder is easy to compute @@ -381,18 +352,11 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { if (!Preheader->getParent()->hasProfileData()) continue; - std::unique_ptr<AliasSetTracker> CurAST; - if (!EnableMSSAInLoopSink) { - CurAST = std::make_unique<AliasSetTracker>(AA); - computeAliasSet(L, *Preheader, *CurAST.get()); - } - // Note that we don't pass SCEV here because it is only used to invalidate // loops in SCEV and we don't preserve (or request) SCEV at all making that // unnecessary. 
- Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI, - /*ScalarEvolution*/ nullptr, - CurAST.get(), MSSA); + Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI, MSSA, + /*ScalarEvolution*/ nullptr); } while (!PreorderLoops.empty()); if (!Changed) @@ -400,13 +364,10 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); + PA.preserve<MemorySSAAnalysis>(); - if (MSSA) { - PA.preserve<MemorySSAAnalysis>(); - - if (VerifyMemorySSA) - MSSA->verifyMemorySSA(); - } + if (VerifyMemorySSA) + MSSA.verifyMemorySSA(); return PA; } @@ -432,24 +393,16 @@ struct LegacyLoopSinkPass : public LoopPass { return false; AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA(); auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); - std::unique_ptr<AliasSetTracker> CurAST; - MemorySSA *MSSA = nullptr; - if (EnableMSSAInLegacyLoopSink) - MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); - else { - CurAST = std::make_unique<AliasSetTracker>(AA); - computeAliasSet(*L, *Preheader, *CurAST.get()); - } - bool Changed = sinkLoopInvariantInstructions( *L, AA, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), getAnalysis<DominatorTreeWrapperPass>().getDomTree(), getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(), - SE ? &SE->getSE() : nullptr, CurAST.get(), MSSA); + MSSA, SE ? 
&SE->getSE() : nullptr); - if (MSSA && VerifyMemorySSA) - MSSA->verifyMemorySSA(); + if (VerifyMemorySSA) + MSSA.verifyMemorySSA(); return Changed; } @@ -458,10 +411,8 @@ struct LegacyLoopSinkPass : public LoopPass { AU.setPreservesCFG(); AU.addRequired<BlockFrequencyInfoWrapperPass>(); getLoopAnalysisUsage(AU); - if (EnableMSSAInLegacyLoopSink) { - AU.addRequired<MemorySSAWrapperPass>(); - AU.addPreserved<MemorySSAWrapperPass>(); - } + AU.addRequired<MemorySSAWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); } }; } diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 654f0d2a03a8..9959e408e2e2 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -78,6 +78,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -91,9 +92,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" -#include "llvm/IR/OperandTraits.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" @@ -114,12 +113,12 @@ #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> -#include <cstdlib> #include <iterator> #include <limits> #include <map> @@ -142,10 +141,7 @@ static const unsigned MaxIVUsers = 200; /// the salvaging is not too expensive for the compiler. 
static const unsigned MaxSCEVSalvageExpressionSize = 64; -// Temporary flag to cleanup congruent phis after LSR phi expansion. -// It's currently disabled until we can determine whether it's truly useful or -// not. The flag should be removed after the v3.0 release. -// This is now needed for ivchains. +// Cleanup congruent phis after LSR phi expansion. static cl::opt<bool> EnablePhiElim( "enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination")); @@ -481,6 +477,12 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { canonicalize(*L); } +static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) { + return SCEVExprContains(S, [&L](const SCEV *S) { + return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L); + }); +} + /// Check whether or not this formula satisfies the canonical /// representation. /// \see Formula::BaseRegs. @@ -494,18 +496,15 @@ bool Formula::isCanonical(const Loop &L) const { if (Scale == 1 && BaseRegs.empty()) return false; - const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg); - if (SAR && SAR->getLoop() == &L) + if (containsAddRecDependentOnLoop(ScaledReg, L)) return true; // If ScaledReg is not a recurrent expr, or it is but its loop is not current // loop, meanwhile BaseRegs contains a recurrent expr reg related with current // loop, we want to swap the reg in BaseRegs with ScaledReg. - auto I = find_if(BaseRegs, [&](const SCEV *S) { - return isa<const SCEVAddRecExpr>(S) && - (cast<SCEVAddRecExpr>(S)->getLoop() == &L); + return none_of(BaseRegs, [&L](const SCEV *S) { + return containsAddRecDependentOnLoop(S, L); }); - return I == BaseRegs.end(); } /// Helper method to morph a formula into its canonical representation. @@ -537,11 +536,9 @@ void Formula::canonicalize(const Loop &L) { // If ScaledReg is an invariant with respect to L, find the reg from // BaseRegs containing the recurrent expr related with Loop L. 
Swap the // reg with ScaledReg. - const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg); - if (!SAR || SAR->getLoop() != &L) { - auto I = find_if(BaseRegs, [&](const SCEV *S) { - return isa<const SCEVAddRecExpr>(S) && - (cast<SCEVAddRecExpr>(S)->getLoop() == &L); + if (!containsAddRecDependentOnLoop(ScaledReg, L)) { + auto I = find_if(BaseRegs, [&L](const SCEV *S) { + return containsAddRecDependentOnLoop(S, L); }); if (I != BaseRegs.end()) std::swap(ScaledReg, *I); @@ -1070,7 +1067,7 @@ public: C.ScaleCost = 0; } - bool isLess(Cost &Other); + bool isLess(const Cost &Other); void Lose(); @@ -1358,6 +1355,8 @@ void Cost::RateFormula(const Formula &F, const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU, SmallPtrSetImpl<const SCEV *> *LoserRegs) { + if (isLoser()) + return; assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula"); // Tally up the registers. unsigned PrevAddRecCost = C.AddRecCost; @@ -1467,7 +1466,7 @@ void Cost::Lose() { } /// Choose the lower cost. -bool Cost::isLess(Cost &Other) { +bool Cost::isLess(const Cost &Other) { if (InsnsCost.getNumOccurrences() > 0 && InsnsCost && C.Insns != Other.C.Insns) return C.Insns < Other.C.Insns; @@ -4081,23 +4080,24 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { continue; // Divide out the factor, ignoring high bits, since we'll be // scaling the value back up in the end. - if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) { - // TODO: This could be optimized to avoid all the copying. - Formula F = Base; - F.ScaledReg = Quotient; - F.deleteBaseReg(F.BaseRegs[i]); - // The canonical representation of 1*reg is reg, which is already in - // Base. In that case, do not try to insert the formula, it will be - // rejected anyway. 
- if (F.Scale == 1 && (F.BaseRegs.empty() || - (AR->getLoop() != L && LU.AllFixupsOutsideLoop))) - continue; - // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate - // non canonical Formula with ScaledReg's loop not being L. - if (F.Scale == 1 && LU.AllFixupsOutsideLoop) - F.canonicalize(*L); - (void)InsertFormula(LU, LUIdx, F); - } + if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) + if (!Quotient->isZero()) { + // TODO: This could be optimized to avoid all the copying. + Formula F = Base; + F.ScaledReg = Quotient; + F.deleteBaseReg(F.BaseRegs[i]); + // The canonical representation of 1*reg is reg, which is already in + // Base. In that case, do not try to insert the formula, it will be + // rejected anyway. + if (F.Scale == 1 && (F.BaseRegs.empty() || + (AR->getLoop() != L && LU.AllFixupsOutsideLoop))) + continue; + // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate + // non canonical Formula with ScaledReg's loop not being L. + if (F.Scale == 1 && LU.AllFixupsOutsideLoop) + F.canonicalize(*L); + (void)InsertFormula(LU, LUIdx, F); + } } } } @@ -5601,6 +5601,27 @@ void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF, DeadInsts.emplace_back(OperandIsInstr); } +// Check if there are any loop exit values which are only used once within the +// loop which may potentially be optimized with a call to rewriteLoopExitValue. +static bool LoopExitValHasSingleUse(Loop *L) { + BasicBlock *ExitBB = L->getExitBlock(); + if (!ExitBB) + return false; + + for (PHINode &ExitPhi : ExitBB->phis()) { + if (ExitPhi.getNumIncomingValues() != 1) + break; + + BasicBlock *Pred = ExitPhi.getIncomingBlock(0); + Value *IVNext = ExitPhi.getIncomingValueForBlock(Pred); + // One use would be the exit phi node, and there should be only one other + // use for this to be considered. 
+ if (IVNext->getNumUses() == 2) + return true; + } + return false; +} + /// Rewrite all the fixup locations with new values, following the chosen /// solution. void LSRInstance::ImplementSolution( @@ -5894,40 +5915,57 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { } namespace { + +/// Enables more convenient iteration over a DWARF expression vector. +static iterator_range<llvm::DIExpression::expr_op_iterator> +ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) { + llvm::DIExpression::expr_op_iterator Begin = + llvm::DIExpression::expr_op_iterator(Expr.begin()); + llvm::DIExpression::expr_op_iterator End = + llvm::DIExpression::expr_op_iterator(Expr.end()); + return {Begin, End}; +} + struct SCEVDbgValueBuilder { SCEVDbgValueBuilder() = default; - SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { - Values = Base.Values; + SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); } + + void clone(const SCEVDbgValueBuilder &Base) { + LocationOps = Base.LocationOps; Expr = Base.Expr; } + void clear() { + LocationOps.clear(); + Expr.clear(); + } + /// The DIExpression as we translate the SCEV. SmallVector<uint64_t, 6> Expr; /// The location ops of the DIExpression. - SmallVector<llvm::ValueAsMetadata *, 2> Values; + SmallVector<Value *, 2> LocationOps; void pushOperator(uint64_t Op) { Expr.push_back(Op); } void pushUInt(uint64_t Operand) { Expr.push_back(Operand); } /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value /// in the set of values referenced by the expression. 
- void pushValue(llvm::Value *V) { + void pushLocation(llvm::Value *V) { Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg); - auto *It = - std::find(Values.begin(), Values.end(), llvm::ValueAsMetadata::get(V)); + auto *It = std::find(LocationOps.begin(), LocationOps.end(), V); unsigned ArgIndex = 0; - if (It != Values.end()) { - ArgIndex = std::distance(Values.begin(), It); + if (It != LocationOps.end()) { + ArgIndex = std::distance(LocationOps.begin(), It); } else { - ArgIndex = Values.size(); - Values.push_back(llvm::ValueAsMetadata::get(V)); + ArgIndex = LocationOps.size(); + LocationOps.push_back(V); } Expr.push_back(ArgIndex); } void pushValue(const SCEVUnknown *U) { llvm::Value *V = cast<SCEVUnknown>(U)->getValue(); - pushValue(V); + pushLocation(V); } bool pushConst(const SCEVConstant *C) { @@ -5938,6 +5976,12 @@ struct SCEVDbgValueBuilder { return true; } + // Iterating the expression as DWARF ops is convenient when updating + // DWARF_OP_LLVM_args. + iterator_range<llvm::DIExpression::expr_op_iterator> expr_ops() { + return ToDwarfOpIter(Expr); + } + /// Several SCEV types are sequences of the same arithmetic operator applied /// to constants and values that may be extended or truncated. bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr, @@ -5979,7 +6023,7 @@ struct SCEVDbgValueBuilder { } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { if (!U->getValue()) return false; - pushValue(U->getValue()); + pushLocation(U->getValue()); } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) { Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul); @@ -6010,52 +6054,6 @@ struct SCEVDbgValueBuilder { return Success; } - void setFinalExpression(llvm::DbgValueInst &DI, const DIExpression *OldExpr) { - // Re-state assumption that this dbg.value is not variadic. Any remaining - // opcodes in its expression operate on a single value already on the - // expression stack. 
Prepend our operations, which will re-compute and - // place that value on the expression stack. - assert(!DI.hasArgList()); - auto *NewExpr = - DIExpression::prependOpcodes(OldExpr, Expr, /*StackValue*/ true); - DI.setExpression(NewExpr); - - auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(Values); - DI.setRawLocation(llvm::DIArgList::get(DI.getContext(), ValArrayRef)); - } - - /// If a DVI can be emitted without a DIArgList, omit DW_OP_llvm_arg and the - /// location op index 0. - void setShortFinalExpression(llvm::DbgValueInst &DI, - const DIExpression *OldExpr) { - assert((Expr[0] == llvm::dwarf::DW_OP_LLVM_arg && Expr[1] == 0) && - "Expected DW_OP_llvm_arg and 0."); - DI.replaceVariableLocationOp( - 0u, llvm::MetadataAsValue::get(DI.getContext(), Values[0])); - - // See setFinalExpression: prepend our opcodes on the start of any old - // expression opcodes. - assert(!DI.hasArgList()); - llvm::SmallVector<uint64_t, 6> FinalExpr(llvm::drop_begin(Expr, 2)); - auto *NewExpr = - DIExpression::prependOpcodes(OldExpr, FinalExpr, /*StackValue*/ true); - DI.setExpression(NewExpr); - } - - /// Once the IV and variable SCEV translation is complete, write it to the - /// source DVI. - void applyExprToDbgValue(llvm::DbgValueInst &DI, - const DIExpression *OldExpr) { - assert(!Expr.empty() && "Unexpected empty expression."); - // Emit a simpler form if only a single location is referenced. - if (Values.size() == 1 && Expr[0] == llvm::dwarf::DW_OP_LLVM_arg && - Expr[1] == 0) { - setShortFinalExpression(DI, OldExpr); - } else { - setFinalExpression(DI, OldExpr); - } - } - /// Return true if the combination of arithmetic operator and underlying /// SCEV constant value is an identity function. bool isIdentityFunction(uint64_t Op, const SCEV *S) { @@ -6104,6 +6102,48 @@ struct SCEVDbgValueBuilder { return true; } + /// Create an expression that is an offset from a value (usually the IV). 
+ void createOffsetExpr(int64_t Offset, Value *OffsetValue) { + pushLocation(OffsetValue); + DIExpression::appendOffset(Expr, Offset); + LLVM_DEBUG( + dbgs() << "scev-salvage: Generated IV offset expression. Offset: " + << std::to_string(Offset) << "\n"); + } + + /// Combine a translation of the SCEV and the IV to create an expression that + /// recovers a location's value. + /// returns true if an expression was created. + bool createIterCountExpr(const SCEV *S, + const SCEVDbgValueBuilder &IterationCount, + ScalarEvolution &SE) { + // SCEVs for SSA values are most frquently of the form + // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..). + // This is because %a is a PHI node that is not the IV. However, these + // SCEVs have not been observed to result in debuginfo-lossy optimisations, + // so its not expected this point will be reached. + if (!isa<SCEVAddRecExpr>(S)) + return false; + + LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S + << '\n'); + + const auto *Rec = cast<SCEVAddRecExpr>(S); + if (!Rec->isAffine()) + return false; + + if (S->getExpressionSize() > MaxSCEVSalvageExpressionSize) + return false; + + // Initialise a new builder with the iteration count expression. In + // combination with the value's SCEV this enables recovery. + clone(IterationCount); + if (!SCEVToValueExpr(*Rec, SE)) + return false; + + return true; + } + /// Convert a SCEV of a value to a DIExpression that is pushed onto the /// builder's expression stack. The stack should already contain an /// expression for the iteration count, so that it can be multiplied by @@ -6133,74 +6173,294 @@ struct SCEVDbgValueBuilder { } return true; } + + // Append the current expression and locations to a location list and an + // expression list. Modify the DW_OP_LLVM_arg indexes to account for + // the locations already present in the destination list. 
+ void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr, + SmallVectorImpl<Value *> &DestLocations) { + assert(!DestLocations.empty() && + "Expected the locations vector to contain the IV"); + // The DWARF_OP_LLVM_arg arguments of the expression being appended must be + // modified to account for the locations already in the destination vector. + // All builders contain the IV as the first location op. + assert(!LocationOps.empty() && + "Expected the location ops to contain the IV."); + // DestIndexMap[n] contains the index in DestLocations for the nth + // location in this SCEVDbgValueBuilder. + SmallVector<uint64_t, 2> DestIndexMap; + for (const auto &Op : LocationOps) { + auto It = find(DestLocations, Op); + if (It != DestLocations.end()) { + // Location already exists in DestLocations, reuse existing ArgIndex. + DestIndexMap.push_back(std::distance(DestLocations.begin(), It)); + continue; + } + // Location is not in DestLocations, add it. + DestIndexMap.push_back(DestLocations.size()); + DestLocations.push_back(Op); + } + + for (const auto &Op : expr_ops()) { + if (Op.getOp() != dwarf::DW_OP_LLVM_arg) { + Op.appendToVector(DestExpr); + continue; + } + + DestExpr.push_back(dwarf::DW_OP_LLVM_arg); + // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV, + // DestIndexMap[n] contains its new index in DestLocations. + uint64_t NewIndex = DestIndexMap[Op.getArg(0)]; + DestExpr.push_back(NewIndex); + } + } }; +/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs +/// and DIExpression. 
struct DVIRecoveryRec { + DVIRecoveryRec(DbgValueInst *DbgValue) + : DVI(DbgValue), Expr(DbgValue->getExpression()), + HadLocationArgList(false) {} + DbgValueInst *DVI; DIExpression *Expr; - Metadata *LocationOp; - const llvm::SCEV *SCEV; + bool HadLocationArgList; + SmallVector<WeakVH, 2> LocationOps; + SmallVector<const llvm::SCEV *, 2> SCEVs; + SmallVector<std::unique_ptr<SCEVDbgValueBuilder>, 2> RecoveryExprs; + + void clear() { + for (auto &RE : RecoveryExprs) + RE.reset(); + RecoveryExprs.clear(); + } + + ~DVIRecoveryRec() { clear(); } }; } // namespace -static void RewriteDVIUsingIterCount(DVIRecoveryRec CachedDVI, - const SCEVDbgValueBuilder &IterationCount, - ScalarEvolution &SE) { - // LSR may add locations to previously single location-op DVIs which - // are currently not supported. - if (CachedDVI.DVI->getNumVariableLocationOps() != 1) - return; +/// Returns the total number of DW_OP_llvm_arg operands in the expression. +/// This helps in determining if a DIArglist is necessary or can be omitted from +/// the dbg.value. +static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) { + auto expr_ops = ToDwarfOpIter(Expr); + unsigned Count = 0; + for (auto Op : expr_ops) + if (Op.getOp() == dwarf::DW_OP_LLVM_arg) + Count++; + return Count; +} - // SCEVs for SSA values are most frquently of the form - // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..). - // This is because %a is a PHI node that is not the IV. However, these - // SCEVs have not been observed to result in debuginfo-lossy optimisations, - // so its not expected this point will be reached. - if (!isa<SCEVAddRecExpr>(CachedDVI.SCEV)) - return; +/// Overwrites DVI with the location and Ops as the DIExpression. This will +/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands, +/// because a DIArglist is not created for the first argument of the dbg.value. 
+static void updateDVIWithLocation(DbgValueInst &DVI, Value *Location, + SmallVectorImpl<uint64_t> &Ops) { + assert( + numLLVMArgOps(Ops) == 0 && + "Expected expression that does not contain any DW_OP_llvm_arg operands."); + DVI.setRawLocation(ValueAsMetadata::get(Location)); + DVI.setExpression(DIExpression::get(DVI.getContext(), Ops)); +} - LLVM_DEBUG(dbgs() << "scev-salvage: Value to salvage SCEV: " - << *CachedDVI.SCEV << '\n'); +/// Overwrite DVI with locations placed into a DIArglist. +static void updateDVIWithLocations(DbgValueInst &DVI, + SmallVectorImpl<Value *> &Locations, + SmallVectorImpl<uint64_t> &Ops) { + assert(numLLVMArgOps(Ops) != 0 && + "Expected expression that references DIArglist locations using " + "DW_OP_llvm_arg operands."); + SmallVector<ValueAsMetadata *, 3> MetadataLocs; + for (Value *V : Locations) + MetadataLocs.push_back(ValueAsMetadata::get(V)); + auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs); + DVI.setRawLocation(llvm::DIArgList::get(DVI.getContext(), ValArrayRef)); + DVI.setExpression(DIExpression::get(DVI.getContext(), Ops)); +} - const auto *Rec = cast<SCEVAddRecExpr>(CachedDVI.SCEV); - if (!Rec->isAffine()) - return; +/// Write the new expression and new location ops for the dbg.value. If possible +/// reduce the szie of the dbg.value intrinsic by omitting DIArglist. This +/// can be omitted if: +/// 1. There is only a single location, refenced by a single DW_OP_llvm_arg. +/// 2. The DW_OP_LLVM_arg is the first operand in the expression. +static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec, + SmallVectorImpl<Value *> &NewLocationOps, + SmallVectorImpl<uint64_t> &NewExpr) { + unsigned NumLLVMArgs = numLLVMArgOps(NewExpr); + if (NumLLVMArgs == 0) { + // Location assumed to be on the stack. 
+ updateDVIWithLocation(*DVIRec.DVI, NewLocationOps[0], NewExpr); + } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) { + // There is only a single DW_OP_llvm_arg at the start of the expression, + // so it can be omitted along with DIArglist. + assert(NewExpr[1] == 0 && + "Lone LLVM_arg in a DIExpression should refer to location-op 0."); + llvm::SmallVector<uint64_t, 6> ShortenedOps(llvm::drop_begin(NewExpr, 2)); + updateDVIWithLocation(*DVIRec.DVI, NewLocationOps[0], ShortenedOps); + } else { + // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary. + updateDVIWithLocations(*DVIRec.DVI, NewLocationOps, NewExpr); + } - if (CachedDVI.SCEV->getExpressionSize() > MaxSCEVSalvageExpressionSize) - return; + // If the DIExpression was previously empty then add the stack terminator. + // Non-empty expressions have only had elements inserted into them and so the + // terminator should already be present e.g. stack_value or fragment. + DIExpression *SalvageExpr = DVIRec.DVI->getExpression(); + if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) { + SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value}); + DVIRec.DVI->setExpression(SalvageExpr); + } +} - // Initialise a new builder with the iteration count expression. In - // combination with the value's SCEV this enables recovery. - SCEVDbgValueBuilder RecoverValue(IterationCount); - if (!RecoverValue.SCEVToValueExpr(*Rec, SE)) - return; +/// Cached location ops may be erased during LSR, in which case an undef is +/// required when restoring from the cache. The type of that location is no +/// longer available, so just use int8. The undef will be replaced by one or +/// more locations later when a SCEVDbgValueBuilder selects alternative +/// locations to use for the salvage. +static Value *getValueOrUndef(WeakVH &VH, LLVMContext &C) { + return (VH) ? VH : UndefValue::get(llvm::Type::getInt8Ty(C)); +} + +/// Restore the DVI's pre-LSR arguments. 
Substitute undef for any erased values. +static void restorePreTransformState(DVIRecoveryRec &DVIRec) { + LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n" + << "scev-salvage: post-LSR: " << *DVIRec.DVI << '\n'); + assert(DVIRec.Expr && "Expected an expression"); + DVIRec.DVI->setExpression(DVIRec.Expr); - LLVM_DEBUG(dbgs() << "scev-salvage: Updating: " << *CachedDVI.DVI << '\n'); - RecoverValue.applyExprToDbgValue(*CachedDVI.DVI, CachedDVI.Expr); - LLVM_DEBUG(dbgs() << "scev-salvage: to: " << *CachedDVI.DVI << '\n'); + // Even a single location-op may be inside a DIArgList and referenced with + // DW_OP_LLVM_arg, which is valid only with a DIArgList. + if (!DVIRec.HadLocationArgList) { + assert(DVIRec.LocationOps.size() == 1 && + "Unexpected number of location ops."); + // LSR's unsuccessful salvage attempt may have added DIArgList, which in + // this case was not present before, so force the location back to a single + // uncontained Value. + Value *CachedValue = + getValueOrUndef(DVIRec.LocationOps[0], DVIRec.DVI->getContext()); + DVIRec.DVI->setRawLocation(ValueAsMetadata::get(CachedValue)); + } else { + SmallVector<ValueAsMetadata *, 3> MetadataLocs; + for (WeakVH VH : DVIRec.LocationOps) { + Value *CachedValue = getValueOrUndef(VH, DVIRec.DVI->getContext()); + MetadataLocs.push_back(ValueAsMetadata::get(CachedValue)); + } + auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs); + DVIRec.DVI->setRawLocation( + llvm::DIArgList::get(DVIRec.DVI->getContext(), ValArrayRef)); + } + LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DVIRec.DVI << '\n'); } -static void RewriteDVIUsingOffset(DVIRecoveryRec &DVIRec, llvm::PHINode &IV, - int64_t Offset) { - assert(!DVIRec.DVI->hasArgList() && "Expected single location-op dbg.value."); - DbgValueInst *DVI = DVIRec.DVI; - SmallVector<uint64_t, 8> Ops; - DIExpression::appendOffset(Ops, Offset); - DIExpression *Expr = DIExpression::prependOpcodes(DVIRec.Expr, Ops, true); - 
LLVM_DEBUG(dbgs() << "scev-salvage: Updating: " << *DVIRec.DVI << '\n'); - DVI->setExpression(Expr); - llvm::Value *ValIV = dyn_cast<llvm::Value>(&IV); - DVI->replaceVariableLocationOp( - 0u, llvm::MetadataAsValue::get(DVI->getContext(), - llvm::ValueAsMetadata::get(ValIV))); - LLVM_DEBUG(dbgs() << "scev-salvage: updated with offset to IV: " - << *DVIRec.DVI << '\n'); +static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, + llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, + const SCEV *SCEVInductionVar, + SCEVDbgValueBuilder IterCountExpr) { + if (!DVIRec.DVI->isUndef()) + return false; + + // LSR may have caused several changes to the dbg.value in the failed salvage + // attempt. So restore the DIExpression, the location ops and also the + // location ops format, which is always DIArglist for multiple ops, but only + // sometimes for a single op. + restorePreTransformState(DVIRec); + + // LocationOpIndexMap[i] will store the post-LSR location index of + // the non-optimised out location at pre-LSR index i. + SmallVector<int64_t, 2> LocationOpIndexMap; + LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1); + SmallVector<Value *, 2> NewLocationOps; + NewLocationOps.push_back(LSRInductionVar); + + for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) { + WeakVH VH = DVIRec.LocationOps[i]; + // Place the locations not optimised out in the list first, avoiding + // inserts later. The map is used to update the DIExpression's + // DW_OP_LLVM_arg arguments as the expression is updated. + if (VH && !isa<UndefValue>(VH)) { + NewLocationOps.push_back(VH); + LocationOpIndexMap[i] = NewLocationOps.size() - 1; + LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i + << " now at index " << LocationOpIndexMap[i] << "\n"); + continue; + } + + // It's possible that a value referred to in the SCEV may have been + // optimised out by LSR. 
+ if (SE.containsErasedValue(DVIRec.SCEVs[i]) || + SE.containsUndefs(DVIRec.SCEVs[i])) { + LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i + << " refers to a location that is now undef or erased. " + "Salvage abandoned.\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i + << " with SCEV: " << *DVIRec.SCEVs[i] << "\n"); + + DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>(); + SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get(); + + // Create an offset-based salvage expression if possible, as it requires + // less DWARF ops than an iteration count-based expression. + if (Optional<APInt> Offset = + SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) { + if (Offset.getValue().getMinSignedBits() <= 64) + SalvageExpr->createOffsetExpr(Offset.getValue().getSExtValue(), + LSRInductionVar); + } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr, + SE)) + return false; + } + + // Merge the DbgValueBuilder generated expressions and the original + // DIExpression, place the result into an new vector. + SmallVector<uint64_t, 3> NewExpr; + if (DVIRec.Expr->getNumElements() == 0) { + assert(DVIRec.RecoveryExprs.size() == 1 && + "Expected only a single recovery expression for an empty " + "DIExpression."); + assert(DVIRec.RecoveryExprs[0] && + "Expected a SCEVDbgSalvageBuilder for location 0"); + SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get(); + B->appendToVectors(NewExpr, NewLocationOps); + } + for (const auto &Op : DVIRec.Expr->expr_ops()) { + // Most Ops needn't be updated. + if (Op.getOp() != dwarf::DW_OP_LLVM_arg) { + Op.appendToVector(NewExpr); + continue; + } + + uint64_t LocationArgIndex = Op.getArg(0); + SCEVDbgValueBuilder *DbgBuilder = + DVIRec.RecoveryExprs[LocationArgIndex].get(); + // The location doesn't have s SCEVDbgValueBuilder, so LSR did not + // optimise it away. 
So just translate the argument to the updated + // location index. + if (!DbgBuilder) { + NewExpr.push_back(dwarf::DW_OP_LLVM_arg); + assert(LocationOpIndexMap[Op.getArg(0)] != -1 && + "Expected a positive index for the location-op position."); + NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]); + continue; + } + // The location has a recovery expression. + DbgBuilder->appendToVectors(NewExpr, NewLocationOps); + } + + UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr); + LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DVI << "\n"); + return true; } +/// Obtain an expression for the iteration count, then attempt to salvage the +/// dbg.value intrinsics. static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, - SmallVector<DVIRecoveryRec, 2> &DVIToUpdate) { + SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) { if (DVIToUpdate.empty()) return; @@ -6213,49 +6473,22 @@ DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, if (!IVAddRec->isAffine()) return; + // Prevent translation using excessive resources. if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize) return; // The iteration count is required to recover location values. SCEVDbgValueBuilder IterCountExpr; - IterCountExpr.pushValue(LSRInductionVar); + IterCountExpr.pushLocation(LSRInductionVar); if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE)) return; LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar << '\n'); - // Needn't salvage if the location op hasn't been undef'd by LSR. for (auto &DVIRec : DVIToUpdate) { - if (!DVIRec.DVI->isUndef()) - continue; - - // Some DVIs that were single location-op when cached are now multi-op, - // due to LSR optimisations. However, multi-op salvaging is not yet - // supported by SCEV salvaging. But, we can attempt a salvage by restoring - // the pre-LSR single-op expression. 
- if (DVIRec.DVI->hasArgList()) { - if (!DVIRec.DVI->getVariableLocationOp(0)) - continue; - llvm::Type *Ty = DVIRec.DVI->getVariableLocationOp(0)->getType(); - DVIRec.DVI->setRawLocation( - llvm::ValueAsMetadata::get(UndefValue::get(Ty))); - DVIRec.DVI->setExpression(DVIRec.Expr); - } - - LLVM_DEBUG(dbgs() << "scev-salvage: value to recover SCEV: " - << *DVIRec.SCEV << '\n'); - - // Create a simple expression if the IV and value to salvage SCEVs - // start values differ by only a constant value. - if (Optional<APInt> Offset = - SE.computeConstantDifference(DVIRec.SCEV, SCEVInductionVar)) { - if (Offset.getValue().getMinSignedBits() <= 64) - RewriteDVIUsingOffset(DVIRec, *LSRInductionVar, - Offset.getValue().getSExtValue()); - } else { - RewriteDVIUsingIterCount(DVIRec, IterCountExpr, SE); - } + SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar, + IterCountExpr); } } } @@ -6263,39 +6496,53 @@ DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, /// Identify and cache salvageable DVI locations and expressions along with the /// corresponding SCEV(s). Also ensure that the DVI is not deleted between /// cacheing and salvaging. -static void -DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, - SmallVector<DVIRecoveryRec, 2> &SalvageableDVISCEVs, - SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) { +static void DbgGatherSalvagableDVI( + Loop *L, ScalarEvolution &SE, + SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs, + SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) { for (auto &B : L->getBlocks()) { for (auto &I : *B) { auto DVI = dyn_cast<DbgValueInst>(&I); if (!DVI) continue; - + // Ensure that if any location op is undef that the dbg.vlue is not + // cached. if (DVI->isUndef()) continue; - if (DVI->hasArgList()) - continue; + // Check that the location op SCEVs are suitable for translation to + // DIExpression. 
+ const auto &HasTranslatableLocationOps = + [&](const DbgValueInst *DVI) -> bool { + for (const auto LocOp : DVI->location_ops()) { + if (!LocOp) + return false; - if (!DVI->getVariableLocationOp(0) || - !SE.isSCEVable(DVI->getVariableLocationOp(0)->getType())) - continue; + if (!SE.isSCEVable(LocOp->getType())) + return false; - // SCEVUnknown wraps an llvm::Value, it does not have a start and stride. - // Therefore no translation to DIExpression is performed. - const SCEV *S = SE.getSCEV(DVI->getVariableLocationOp(0)); - if (isa<SCEVUnknown>(S)) - continue; + const SCEV *S = SE.getSCEV(LocOp); + if (SE.containsUndefs(S)) + return false; + } + return true; + }; - // Avoid wasting resources generating an expression containing undef. - if (SE.containsUndefs(S)) + if (!HasTranslatableLocationOps(DVI)) continue; - SalvageableDVISCEVs.push_back( - {DVI, DVI->getExpression(), DVI->getRawLocation(), - SE.getSCEV(DVI->getVariableLocationOp(0))}); + std::unique_ptr<DVIRecoveryRec> NewRec = + std::make_unique<DVIRecoveryRec>(DVI); + // Each location Op may need a SCEVDbgValueBuilder in order to recover it. + // Pre-allocating a vector will enable quick lookups of the builder later + // during the salvage. + NewRec->RecoveryExprs.resize(DVI->getNumVariableLocationOps()); + for (const auto LocOp : DVI->location_ops()) { + NewRec->SCEVs.push_back(SE.getSCEV(LocOp)); + NewRec->LocationOps.push_back(LocOp); + NewRec->HadLocationArgList = DVI->hasArgList(); + } + SalvageableDVISCEVs.push_back(std::move(NewRec)); DVIHandles.insert(DVI); } } @@ -6344,9 +6591,9 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, // Debug preservation - before we start removing anything identify which DVI // meet the salvageable criteria and store their DIExpression and SCEVs. 
- SmallVector<DVIRecoveryRec, 2> SalvageableDVI; + SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords; SmallSet<AssertingVH<DbgValueInst>, 2> DVIHandles; - DbgGatherSalvagableDVI(L, SE, SalvageableDVI, DVIHandles); + DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords, DVIHandles); bool Changed = false; std::unique_ptr<MemorySSAUpdater> MSSAU; @@ -6375,8 +6622,26 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get()); } } + // LSR may at times remove all uses of an induction variable from a loop. + // The only remaining use is the PHI in the exit block. + // When this is the case, if the exit value of the IV can be calculated using + // SCEV, we can replace the exit block PHI with the final value of the IV and + // skip the updates in each loop iteration. + if (L->isRecursivelyLCSSAForm(DT, LI) && LoopExitValHasSingleUse(L)) { + SmallVector<WeakTrackingVH, 16> DeadInsts; + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + SCEVExpander Rewriter(SE, DL, "lsr", false); + int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT, + OnlyCheapRepl, DeadInsts); + if (Rewrites) { + Changed = true; + RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI, + MSSAU.get()); + DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get()); + } + } - if (SalvageableDVI.empty()) + if (SalvageableDVIRecords.empty()) return Changed; // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with @@ -6384,13 +6649,16 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, // TODO: Allow for multiple IV references for nested AddRecSCEVs for (auto &L : LI) { if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer)) - DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVI); + DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords); else { LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. 
An IV " "could not be identified.\n"); } } + for (auto &Rec : SalvageableDVIRecords) + Rec->clear(); + SalvageableDVIRecords.clear(); DVIHandles.clear(); return Changed; } diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 1ecbb86724e1..8c2868563227 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -42,10 +43,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/LCSSA.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopPeel.h" -#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include <cassert> @@ -331,14 +330,23 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, SmallPtrSet<const Value *, 32> EphValues; CodeMetrics::collectEphemeralValues(L, &AC, EphValues); Loop *SubLoop = L->getSubLoops()[0]; - unsigned InnerLoopSize = + InstructionCost InnerLoopSizeIC = ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable, Convergent, TTI, EphValues, UP.BEInsns); - unsigned OuterLoopSize = + InstructionCost OuterLoopSizeIC = ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, EphValues, UP.BEInsns); - LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n"); - LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n"); + LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSizeIC << 
"\n"); + LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSizeIC << "\n"); + + if (!InnerLoopSizeIC.isValid() || !OuterLoopSizeIC.isValid()) { + LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions" + << " with invalid cost.\n"); + return LoopUnrollResult::Unmodified; + } + unsigned InnerLoopSize = *InnerLoopSizeIC.getValue(); + unsigned OuterLoopSize = *OuterLoopSizeIC.getValue(); + if (NotDuplicatable) { LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable " "instructions.\n"); @@ -364,7 +372,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, Optional<MDNode *> NewInnerEpilogueLoopID = makeFollowupLoopID( OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupRemainderInner}); - if (NewInnerEpilogueLoopID.hasValue()) + if (NewInnerEpilogueLoopID) SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue()); // Find trip count and trip multiple @@ -394,14 +402,14 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, Optional<MDNode *> NewOuterEpilogueLoopID = makeFollowupLoopID( OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupRemainderOuter}); - if (NewOuterEpilogueLoopID.hasValue()) + if (NewOuterEpilogueLoopID) EpilogueOuterLoop->setLoopID(NewOuterEpilogueLoopID.getValue()); } Optional<MDNode *> NewInnerLoopID = makeFollowupLoopID(OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupInner}); - if (NewInnerLoopID.hasValue()) + if (NewInnerLoopID) SubLoop->setLoopID(NewInnerLoopID.getValue()); else SubLoop->setLoopID(OrigSubLoopID); @@ -410,7 +418,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, Optional<MDNode *> NewOuterLoopID = makeFollowupLoopID( OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupOuter}); - if (NewOuterLoopID.hasValue()) { + if (NewOuterLoopID) { L->setLoopID(NewOuterLoopID.getValue()); // Do not setLoopAlreadyUnrolled if a followup was given. 
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 9beb2281cf0f..fda86afe5f9d 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -25,7 +25,6 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -133,7 +132,7 @@ static cl::opt<bool> UnrollAllowRemainder( "when unrolling a loop.")); static cl::opt<bool> - UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, + UnrollRuntime("unroll-runtime", cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); static cl::opt<unsigned> UnrollMaxUpperBound( @@ -254,19 +253,19 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze; // Apply user values provided by argument - if (UserThreshold.hasValue()) { + if (UserThreshold) { UP.Threshold = *UserThreshold; UP.PartialThreshold = *UserThreshold; } - if (UserCount.hasValue()) + if (UserCount) UP.Count = *UserCount; - if (UserAllowPartial.hasValue()) + if (UserAllowPartial) UP.Partial = *UserAllowPartial; - if (UserRuntime.hasValue()) + if (UserRuntime) UP.Runtime = *UserRuntime; - if (UserUpperBound.hasValue()) + if (UserUpperBound) UP.UpperBound = *UserUpperBound; - if (UserFullUnrollMaxCount.hasValue()) + if (UserFullUnrollMaxCount) UP.FullUnrollMaxCount = *UserFullUnrollMaxCount; return UP; @@ -664,7 +663,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost( } /// ApproximateLoopSize - Approximate the size of the loop. 
-unsigned llvm::ApproximateLoopSize( +InstructionCost llvm::ApproximateLoopSize( const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) { @@ -675,7 +674,7 @@ unsigned llvm::ApproximateLoopSize( NotDuplicatable = Metrics.notDuplicatable; Convergent = Metrics.convergent; - unsigned LoopSize = Metrics.NumInsts; + InstructionCost LoopSize = Metrics.NumInsts; // Don't allow an estimate of size zero. This would allows unrolling of loops // with huge iteration counts, which is a compile time problem even if it's @@ -683,7 +682,9 @@ unsigned llvm::ApproximateLoopSize( // that each loop has at least three instructions (likely a conditional // branch, a comparison feeding that branch, and some kind of loop increment // feeding that comparison instruction). - LoopSize = std::max(LoopSize, BEInsns + 1); + if (LoopSize.isValid() && *LoopSize.getValue() < BEInsns + 1) + // This is an open coded max() on InstructionCost + LoopSize = BEInsns + 1; return LoopSize; } @@ -788,15 +789,13 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, // 2nd priority is unroll count set by pragma. 
if (PInfo.PragmaCount > 0) { - if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)) && - UCE.getUnrolledLoopSize(UP, PInfo.PragmaCount) < PragmaUnrollThreshold) + if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0))) return PInfo.PragmaCount; } - if (PInfo.PragmaFullUnroll && TripCount != 0) { - if (UCE.getUnrolledLoopSize(UP, TripCount) < PragmaUnrollThreshold) - return TripCount; - } + if (PInfo.PragmaFullUnroll && TripCount != 0) + return TripCount; + // if didn't return until here, should continue to other priorties return None; } @@ -912,7 +911,7 @@ bool llvm::computeUnrollCount( if (PP.PeelCount) { if (UnrollCount.getNumOccurrences() > 0) { report_fatal_error("Cannot specify both explicit peel count and " - "explicit unroll count"); + "explicit unroll count", /*GenCrashDiag=*/false); } UP.Count = 1; UP.Runtime = false; @@ -1192,10 +1191,18 @@ static LoopUnrollResult tryToUnrollLoop( SmallPtrSet<const Value *, 32> EphValues; CodeMetrics::collectEphemeralValues(L, &AC, EphValues); - unsigned LoopSize = + InstructionCost LoopSizeIC = ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, EphValues, UP.BEInsns); - LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSizeIC << "\n"); + + if (!LoopSizeIC.isValid()) { + LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions" + << " with invalid cost.\n"); + return LoopUnrollResult::Unmodified; + } + unsigned LoopSize = *LoopSizeIC.getValue(); + if (NotDuplicatable) { LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" << " instructions.\n"); @@ -1316,7 +1323,7 @@ static LoopUnrollResult tryToUnrollLoop( Optional<MDNode *> RemainderLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder}); - if (RemainderLoopID.hasValue()) + if (RemainderLoopID) RemainderLoop->setLoopID(RemainderLoopID.getValue()); } @@ -1324,7 +1331,7 @@ static 
LoopUnrollResult tryToUnrollLoop( Optional<MDNode *> NewLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupUnrolled}); - if (NewLoopID.hasValue()) { + if (NewLoopID) { L->setLoopID(NewLoopID.getValue()); // Do not setLoopAlreadyUnrolled if loop attributes have been specified @@ -1548,8 +1555,12 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, PreservedAnalyses LoopUnrollPass::run(Function &F, FunctionAnalysisManager &AM) { - auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &LI = AM.getResult<LoopAnalysis>(F); + // There are no loops in the function. Return before computing other expensive + // analyses. + if (LI.empty()) + return PreservedAnalyses::all(); + auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &TTI = AM.getResult<TargetIRAnalysis>(F); auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); diff --git a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp deleted file mode 100644 index 76bb5497c2c2..000000000000 --- a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ /dev/null @@ -1,1774 +0,0 @@ -//===- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop -------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass transforms loops that contain branches on loop-invariant conditions -// to multiple loops. For example, it turns the left into the right code: -// -// for (...) if (lic) -// A for (...) -// if (lic) A; B; C -// B else -// C for (...) 
-// A; C -// -// This can increase the size of the code exponentially (doubling it every time -// a loop is unswitched) so we only unswitch if the resultant code will be -// smaller than a threshold. -// -// This pass expects LICM to be run before it to hoist invariant conditions out -// of the loop, to make the unswitching opportunity obvious. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LazyBlockFrequencyInfo.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/MemorySSA.h" -#include "llvm/Analysis/MemorySSAUpdater.h" -#include "llvm/Analysis/MustExecute.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constant.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/User.h" -#include "llvm/IR/Value.h" -#include "llvm/IR/ValueHandle.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" -#include 
"llvm/Transforms/Scalar/LoopPassManager.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include <algorithm> -#include <cassert> -#include <map> -#include <set> -#include <tuple> -#include <utility> -#include <vector> - -using namespace llvm; - -#define DEBUG_TYPE "loop-unswitch" - -STATISTIC(NumBranches, "Number of branches unswitched"); -STATISTIC(NumSwitches, "Number of switches unswitched"); -STATISTIC(NumGuards, "Number of guards unswitched"); -STATISTIC(NumSelects , "Number of selects unswitched"); -STATISTIC(NumTrivial , "Number of unswitches that are trivial"); -STATISTIC(NumSimplify, "Number of simplifications of unswitched code"); -STATISTIC(TotalInsts, "Total number of instructions analyzed"); - -// The specific value of 100 here was chosen based only on intuition and a -// few specific examples. -static cl::opt<unsigned> -Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), - cl::init(100), cl::Hidden); - -static cl::opt<unsigned> - MSSAThreshold("loop-unswitch-memoryssa-threshold", - cl::desc("Max number of memory uses to explore during " - "partial unswitching analysis"), - cl::init(100), cl::Hidden); - -namespace { - - class LUAnalysisCache { - using UnswitchedValsMap = - DenseMap<const SwitchInst *, SmallPtrSet<const Value *, 8>>; - using UnswitchedValsIt = UnswitchedValsMap::iterator; - - struct LoopProperties { - unsigned CanBeUnswitchedCount; - unsigned WasUnswitchedCount; - unsigned SizeEstimation; - UnswitchedValsMap UnswitchedVals; - }; - - // Here we use std::map instead of DenseMap, since we need to keep valid - // LoopProperties pointer for current loop for better performance. 
- using LoopPropsMap = std::map<const Loop *, LoopProperties>; - using LoopPropsMapIt = LoopPropsMap::iterator; - - LoopPropsMap LoopsProperties; - UnswitchedValsMap *CurLoopInstructions = nullptr; - LoopProperties *CurrentLoopProperties = nullptr; - - // A loop unswitching with an estimated cost above this threshold - // is not performed. MaxSize is turned into unswitching quota for - // the current loop, and reduced correspondingly, though note that - // the quota is returned by releaseMemory() when the loop has been - // processed, so that MaxSize will return to its previous - // value. So in most cases MaxSize will equal the Threshold flag - // when a new loop is processed. An exception to that is that - // MaxSize will have a smaller value while processing nested loops - // that were introduced due to loop unswitching of an outer loop. - // - // FIXME: The way that MaxSize works is subtle and depends on the - // pass manager processing loops and calling releaseMemory() in a - // specific order. It would be good to find a more straightforward - // way of doing what MaxSize does. - unsigned MaxSize; - - public: - LUAnalysisCache() : MaxSize(Threshold) {} - - // Analyze loop. Check its size, calculate is it possible to unswitch - // it. Returns true if we can unswitch this loop. - bool countLoop(const Loop *L, const TargetTransformInfo &TTI, - AssumptionCache *AC); - - // Clean all data related to given loop. - void forgetLoop(const Loop *L); - - // Mark case value as unswitched. - // Since SI instruction can be partly unswitched, in order to avoid - // extra unswitching in cloned loops keep track all unswitched values. - void setUnswitched(const SwitchInst *SI, const Value *V); - - // Check was this case value unswitched before or not. - bool isUnswitched(const SwitchInst *SI, const Value *V); - - // Returns true if another unswitching could be done within the cost - // threshold. 
- bool costAllowsUnswitching(); - - // Clone all loop-unswitch related loop properties. - // Redistribute unswitching quotas. - // Note, that new loop data is stored inside the VMap. - void cloneData(const Loop *NewLoop, const Loop *OldLoop, - const ValueToValueMapTy &VMap); - }; - - class LoopUnswitch : public LoopPass { - LoopInfo *LI; // Loop information - LPPassManager *LPM; - AssumptionCache *AC; - - // Used to check if second loop needs processing after - // rewriteLoopBodyWithConditionConstant rewrites first loop. - std::vector<Loop*> LoopProcessWorklist; - - LUAnalysisCache BranchesInfo; - - bool OptimizeForSize; - bool RedoLoop = false; - - Loop *CurrentLoop = nullptr; - DominatorTree *DT = nullptr; - MemorySSA *MSSA = nullptr; - AAResults *AA = nullptr; - std::unique_ptr<MemorySSAUpdater> MSSAU; - BasicBlock *LoopHeader = nullptr; - BasicBlock *LoopPreheader = nullptr; - - bool SanitizeMemory; - SimpleLoopSafetyInfo SafetyInfo; - - // LoopBlocks contains all of the basic blocks of the loop, including the - // preheader of the loop, the body of the loop, and the exit blocks of the - // loop, in that order. - std::vector<BasicBlock*> LoopBlocks; - // NewBlocks contained cloned copy of basic blocks from LoopBlocks. - std::vector<BasicBlock*> NewBlocks; - - bool HasBranchDivergence; - - public: - static char ID; // Pass ID, replacement for typeid - - explicit LoopUnswitch(bool Os = false, bool HasBranchDivergence = false) - : LoopPass(ID), OptimizeForSize(Os), - HasBranchDivergence(HasBranchDivergence) { - initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); - } - - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - bool processCurrentLoop(); - bool isUnreachableDueToPreviousUnswitching(BasicBlock *); - - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG. 
- /// - void getAnalysisUsage(AnalysisUsage &AU) const override { - // Lazy BFI and BPI are marked as preserved here so Loop Unswitching - // can remain part of the same loop pass as LICM - AU.addPreserved<LazyBlockFrequencyInfoPass>(); - AU.addPreserved<LazyBranchProbabilityInfoPass>(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addRequired<MemorySSAWrapperPass>(); - AU.addPreserved<MemorySSAWrapperPass>(); - if (HasBranchDivergence) - AU.addRequired<LegacyDivergenceAnalysis>(); - getLoopAnalysisUsage(AU); - } - - private: - void releaseMemory() override { BranchesInfo.forgetLoop(CurrentLoop); } - - void initLoopData() { - LoopHeader = CurrentLoop->getHeader(); - LoopPreheader = CurrentLoop->getLoopPreheader(); - } - - /// Split all of the edges from inside the loop to their exit blocks. - /// Update the appropriate Phi nodes as we do so. - void splitExitEdges(Loop *L, - const SmallVectorImpl<BasicBlock *> &ExitBlocks); - - bool tryTrivialLoopUnswitch(bool &Changed); - - bool unswitchIfProfitable(Value *LoopCond, Constant *Val, - Instruction *TI = nullptr, - ArrayRef<Instruction *> ToDuplicate = {}); - void unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, - BasicBlock *ExitBlock, Instruction *TI); - void unswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L, - Instruction *TI, - ArrayRef<Instruction *> ToDuplicate = {}); - - void rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, - Constant *Val, bool IsEqual); - - void - emitPreheaderBranchOnCondition(Value *LIC, Constant *Val, - BasicBlock *TrueDest, BasicBlock *FalseDest, - BranchInst *OldBranch, Instruction *TI, - ArrayRef<Instruction *> ToDuplicate = {}); - - void simplifyCode(std::vector<Instruction *> &Worklist, Loop *L); - - /// Given that the Invariant is not equal to Val. Simplify instructions - /// in the loop. 
- Value *simplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant, - Constant *Val); - }; - -} // end anonymous namespace - -// Analyze loop. Check its size, calculate is it possible to unswitch -// it. Returns true if we can unswitch this loop. -bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI, - AssumptionCache *AC) { - LoopPropsMapIt PropsIt; - bool Inserted; - std::tie(PropsIt, Inserted) = - LoopsProperties.insert(std::make_pair(L, LoopProperties())); - - LoopProperties &Props = PropsIt->second; - - if (Inserted) { - // New loop. - - // Limit the number of instructions to avoid causing significant code - // expansion, and the number of basic blocks, to avoid loops with - // large numbers of branches which cause loop unswitching to go crazy. - // This is a very ad-hoc heuristic. - - SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - - // FIXME: This is overly conservative because it does not take into - // consideration code simplification opportunities and code that can - // be shared by the resultant unswitched loops. - CodeMetrics Metrics; - for (BasicBlock *BB : L->blocks()) - Metrics.analyzeBasicBlock(BB, TTI, EphValues); - - Props.SizeEstimation = Metrics.NumInsts; - Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation); - Props.WasUnswitchedCount = 0; - MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount; - - if (Metrics.notDuplicatable) { - LLVM_DEBUG(dbgs() << "NOT unswitching loop %" << L->getHeader()->getName() - << ", contents cannot be " - << "duplicated!\n"); - return false; - } - } - - // Be careful. This links are good only before new loop addition. - CurrentLoopProperties = &Props; - CurLoopInstructions = &Props.UnswitchedVals; - - return true; -} - -// Clean all data related to given loop. 
-void LUAnalysisCache::forgetLoop(const Loop *L) { - LoopPropsMapIt LIt = LoopsProperties.find(L); - - if (LIt != LoopsProperties.end()) { - LoopProperties &Props = LIt->second; - MaxSize += (Props.CanBeUnswitchedCount + Props.WasUnswitchedCount) * - Props.SizeEstimation; - LoopsProperties.erase(LIt); - } - - CurrentLoopProperties = nullptr; - CurLoopInstructions = nullptr; -} - -// Mark case value as unswitched. -// Since SI instruction can be partly unswitched, in order to avoid -// extra unswitching in cloned loops keep track all unswitched values. -void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) { - (*CurLoopInstructions)[SI].insert(V); -} - -// Check was this case value unswitched before or not. -bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) { - return (*CurLoopInstructions)[SI].count(V); -} - -bool LUAnalysisCache::costAllowsUnswitching() { - return CurrentLoopProperties->CanBeUnswitchedCount > 0; -} - -// Clone all loop-unswitch related loop properties. -// Redistribute unswitching quotas. -// Note, that new loop data is stored inside the VMap. -void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop, - const ValueToValueMapTy &VMap) { - LoopProperties &NewLoopProps = LoopsProperties[NewLoop]; - LoopProperties &OldLoopProps = *CurrentLoopProperties; - UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals; - - // Reallocate "can-be-unswitched quota" - - --OldLoopProps.CanBeUnswitchedCount; - ++OldLoopProps.WasUnswitchedCount; - NewLoopProps.WasUnswitchedCount = 0; - unsigned Quota = OldLoopProps.CanBeUnswitchedCount; - NewLoopProps.CanBeUnswitchedCount = Quota / 2; - OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2; - - NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation; - - // Clone unswitched values info: - // for new loop switches we clone info about values that was - // already unswitched and has redundant successors. 
- for (const auto &I : Insts) { - const SwitchInst *OldInst = I.first; - Value *NewI = VMap.lookup(OldInst); - const SwitchInst *NewInst = cast_or_null<SwitchInst>(NewI); - assert(NewInst && "All instructions that are in SrcBB must be in VMap."); - - NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst]; - } -} - -char LoopUnswitch::ID = 0; - -INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", - false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopPass) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) -INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops", - false, false) - -Pass *llvm::createLoopUnswitchPass(bool Os, bool HasBranchDivergence) { - return new LoopUnswitch(Os, HasBranchDivergence); -} - -/// Operator chain lattice. -enum OperatorChain { - OC_OpChainNone, ///< There is no operator. - OC_OpChainOr, ///< There are only ORs. - OC_OpChainAnd, ///< There are only ANDs. - OC_OpChainMixed ///< There are ANDs and ORs. -}; - -/// Cond is a condition that occurs in L. If it is invariant in the loop, or has -/// an invariant piece, return the invariant. Otherwise, return null. -// -/// NOTE: findLIVLoopCondition will not return a partial LIV by walking up a -/// mixed operator chain, as we can not reliably find a value which will -/// simplify the operator chain. If the chain is AND-only or OR-only, we can use -/// 0 or ~0 to simplify the chain. -/// -/// NOTE: In case a partial LIV and a mixed operator chain, we may be able to -/// simplify the condition itself to a loop variant condition, but at the -/// cost of creating an entirely new loop. 
static Value *findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
                                   OperatorChain &ParentChain,
                                   DenseMap<Value *, Value *> &Cache,
                                   MemorySSAUpdater *MSSAU) {
  // Cache memoizes the answer (possibly null) for every value already visited.
  auto CacheIt = Cache.find(Cond);
  if (CacheIt != Cache.end())
    return CacheIt->second;

  // We started analyzing a new instruction; increment the scanned-instructions
  // counter.
  ++TotalInsts;

  // We can never unswitch on vector conditions.
  if (Cond->getType()->isVectorTy())
    return nullptr;

  // Constants should be folded, not unswitched on!
  if (isa<Constant>(Cond)) return nullptr;

  // TODO: Handle: br (VARIANT|INVARIANT).

  // Hoist simple values out.
  if (L->makeLoopInvariant(Cond, Changed, nullptr, MSSAU)) {
    Cache[Cond] = Cond;
    return Cond;
  }

  // Walk up the operator chain to find partial invariant conditions.
  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
    if (BO->getOpcode() == Instruction::And ||
        BO->getOpcode() == Instruction::Or) {
      // Given the previous operator, compute the current operator chain status.
      OperatorChain NewChain;
      switch (ParentChain) {
      case OC_OpChainNone:
        NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
                                      OC_OpChainOr;
        break;
      case OC_OpChainOr:
        NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr :
                                      OC_OpChainMixed;
        break;
      case OC_OpChainAnd:
        NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
                                      OC_OpChainMixed;
        break;
      case OC_OpChainMixed:
        NewChain = OC_OpChainMixed;
        break;
      }

      // If we reach a Mixed state, we do not want to keep walking up as we can
      // not reliably find a value that will simplify the chain. With this
      // check, we will return null on the first sight of a mixed chain and the
      // caller will either backtrack to find a partial LIV in the other operand
      // or return null.
      if (NewChain != OC_OpChainMixed) {
        // Update the current operator chain type before we search up the chain.
        ParentChain = NewChain;
        // If either the left or right side is invariant, we can unswitch on
        // this, which will cause the branch to go away in one loop and the
        // condition to simplify in the other one.
        if (Value *LHS = findLIVLoopCondition(BO->getOperand(0), L, Changed,
                                              ParentChain, Cache, MSSAU)) {
          Cache[Cond] = LHS;
          return LHS;
        }
        // We did not manage to find a partial LIV in operand(0). Backtrack and
        // try operand(1). ParentChain is reset because the recursive call
        // above may have modified it.
        ParentChain = NewChain;
        if (Value *RHS = findLIVLoopCondition(BO->getOperand(1), L, Changed,
                                              ParentChain, Cache, MSSAU)) {
          Cache[Cond] = RHS;
          return RHS;
        }
      }
    }

  Cache[Cond] = nullptr;
  return nullptr;
}

/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
/// an invariant piece, return the invariant along with the operator chain type.
/// Otherwise, return null.
static std::pair<Value *, OperatorChain>
findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
                     MemorySSAUpdater *MSSAU) {
  DenseMap<Value *, Value *> Cache;
  OperatorChain OpChain = OC_OpChainNone;
  Value *FCond = findLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU);

  // In case we do find a LIV, it can not be obtained by walking up a mixed
  // operator chain.
  assert((!FCond || OpChain != OC_OpChainMixed) &&
         "Do not expect a partial LIV with mixed operator chain");
  return {FCond, OpChain};
}

/// Legacy pass entry point: fetch the required analyses for \p L, then call
/// processCurrentLoop() repeatedly for as long as it sets RedoLoop. Returns
/// true if any iteration reported a change.
bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) {
  if (skipLoop(L))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
      *L->getHeader()->getParent());
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  LPM = &LPMRef;
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
  MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  CurrentLoop = L;
  Function *F = CurrentLoop->getHeader()->getParent();

  // Loop safety info is only consulted for MSan-instrumented functions (see
  // processCurrentLoop), so only compute it for those.
  SanitizeMemory = F->hasFnAttribute(Attribute::SanitizeMemory);
  if (SanitizeMemory)
    SafetyInfo.computeLoopSafetyInfo(L);

  if (VerifyMemorySSA)
    MSSA->verifyMemorySSA();

  bool Changed = false;
  do {
    assert(CurrentLoop->isLCSSAForm(*DT));
    if (VerifyMemorySSA)
      MSSA->verifyMemorySSA();
    RedoLoop = false;
    Changed |= processCurrentLoop();
  } while (RedoLoop);

  if (VerifyMemorySSA)
    MSSA->verifyMemorySSA();

  return Changed;
}

// Return true if the BasicBlock BB is unreachable from the loop header.
// Return false, otherwise.
//
// Walks the immediate dominators of BB that are inside the current loop and
// looks for a conditional branch on a ConstantInt whose never-taken successor
// dominates BB.
bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) {
  auto *Node = DT->getNode(BB)->getIDom();
  BasicBlock *DomBB = Node->getBlock();
  while (CurrentLoop->contains(DomBB)) {
    BranchInst *BInst = dyn_cast<BranchInst>(DomBB->getTerminator());

    // Advance to the next dominator up front so that the `continue`s below
    // still make progress.
    Node = DT->getNode(DomBB)->getIDom();
    DomBB = Node->getBlock();

    if (!BInst || !BInst->isConditional())
      continue;

    Value *Cond = BInst->getCondition();
    if (!isa<ConstantInt>(Cond))
      continue;

    // A branch on constant true never takes successor 1, and vice versa.
    BasicBlock *UnreachableSucc =
        Cond == ConstantInt::getTrue(Cond->getContext())
            ? BInst->getSuccessor(1)
            : BInst->getSuccessor(0);

    if (DT->dominates(UnreachableSucc, BB))
      return true;
  }
  return false;
}

/// FIXME: Remove this workaround when freeze related patches are done.
/// LoopUnswitch and equality propagation in GVN have a discrepancy about
/// whether branch on undef/poison has undefined behavior. This function rules
/// out some common cases where that discrepancy was already found to cause
/// problems. Details can be found in PR31652. Note that if the function
/// returns true, unswitching is unsafe; if it returns false, that does not
/// mean it is necessarily safe.
static bool equalityPropUnSafe(Value &LoopCond) {
  ICmpInst *CI = dyn_cast<ICmpInst>(&LoopCond);
  if (!CI || !CI->isEquality())
    return false;

  Value *LHS = CI->getOperand(0);
  Value *RHS = CI->getOperand(1);
  if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS))
    return true;

  // True if any incoming value of the PHI is undef.
  auto HasUndefInPHI = [](PHINode &PN) {
    for (Value *Opd : PN.incoming_values()) {
      if (isa<UndefValue>(Opd))
        return true;
    }
    return false;
  };
  PHINode *LPHI = dyn_cast<PHINode>(LHS);
  PHINode *RPHI = dyn_cast<PHINode>(RHS);
  if ((LPHI && HasUndefInPHI(*LPHI)) || (RPHI && HasUndefInPHI(*RPHI)))
    return true;

  // True if either arm of the select is undef.
  auto HasUndefInSelect = [](SelectInst &SI) {
    if (isa<UndefValue>(SI.getTrueValue()) ||
        isa<UndefValue>(SI.getFalseValue()))
      return true;
    return false;
  };
  SelectInst *LSI = dyn_cast<SelectInst>(LHS);
  SelectInst *RSI = dyn_cast<SelectInst>(RHS);
  if ((LSI && HasUndefInSelect(*LSI)) || (RSI && HasUndefInSelect(*RSI)))
    return true;
  return false;
}

/// Do actual work and unswitch loop if possible and profitable.
bool LoopUnswitch::processCurrentLoop() {
  bool Changed = false;

  initLoopData();

  // If LoopSimplify was unable to form a preheader, don't do any unswitching.
  if (!LoopPreheader)
    return false;

  // Loops with indirectbr cannot be cloned.
  if (!CurrentLoop->isSafeToClone())
    return false;

  // Without dedicated exits, splitting the exit edge may fail.
  if (!CurrentLoop->hasDedicatedExits())
    return false;

  LLVMContext &Context = LoopHeader->getContext();

  // Analyze loop cost, and stop unswitching if the loop content can not be
  // duplicated.
  if (!BranchesInfo.countLoop(
          CurrentLoop,
          getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
              *CurrentLoop->getHeader()->getParent()),
          AC))
    return false;

  // Try trivial unswitching first, before looping over the other basic blocks
  // in the loop.
  if (tryTrivialLoopUnswitch(Changed)) {
    return true;
  }

  // Do not do non-trivial unswitch while optimizing for size.
  // FIXME: Use Function::hasOptSize().
  if (OptimizeForSize ||
      LoopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
    return Changed;

  // Run through the instructions in the loop, keeping track of three things:
  //
  //  - That we do not unswitch loops containing convergent operations, as we
  //    might be making them control dependent on the unswitch value when they
  //    were not before.
  //    FIXME: This could be refined to only bail if the convergent operation is
  //    not already control-dependent on the unswitch value.
  //
  //  - That basic blocks in the loop contain invokes whose predecessor edges we
  //    cannot split.
  //
  //  - The set of guard intrinsics encountered (these are non terminator
  //    instructions that are also profitable to be unswitched).

  SmallVector<IntrinsicInst *, 4> Guards;

  for (const auto BB : CurrentLoop->blocks()) {
    for (auto &I : *BB) {
      auto *CB = dyn_cast<CallBase>(&I);
      if (!CB)
        continue;
      if (CB->isConvergent())
        return Changed;
      if (auto *II = dyn_cast<InvokeInst>(&I))
        if (!II->getUnwindDest()->canSplitPredecessors())
          return Changed;
      if (auto *II = dyn_cast<IntrinsicInst>(&I))
        if (II->getIntrinsicID() == Intrinsic::experimental_guard)
          Guards.push_back(II);
    }
  }

  for (IntrinsicInst *Guard : Guards) {
    Value *LoopCond = findLIVLoopCondition(Guard->getOperand(0), CurrentLoop,
                                           Changed, MSSAU.get())
                          .first;
    if (LoopCond &&
        unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
      // NB! Unswitching (if successful) could have erased some of the
      // instructions in Guards leaving dangling pointers there.  This is fine
      // because we're returning now, and won't look at Guards again.
      ++NumGuards;
      return true;
    }
  }

  // Loop over all of the basic blocks in the loop.  If we find an interior
  // block that is branching on a loop-invariant condition, we can unswitch this
  // loop.
  for (Loop::block_iterator I = CurrentLoop->block_begin(),
                            E = CurrentLoop->block_end();
       I != E; ++I) {
    Instruction *TI = (*I)->getTerminator();

    // Unswitching on a potentially uninitialized predicate is not
    // MSan-friendly. Limit this to the cases when the original predicate is
    // guaranteed to execute, to avoid creating a use-of-uninitialized-value
    // in the code that did not have one.
    // This is a workaround for the discrepancy between LLVM IR and MSan
    // semantics. See PR28054 for more details.
    if (SanitizeMemory &&
        !SafetyInfo.isGuaranteedToExecute(*TI, DT, CurrentLoop))
      continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      // Some branches may be rendered unreachable because of previous
      // unswitching.
      // Unswitch only those branches that are reachable.
      if (isUnreachableDueToPreviousUnswitching(*I))
        continue;

      // If this isn't branching on an invariant condition, we can't unswitch
      // it.
      if (BI->isConditional()) {
        // See if this, or some part of it, is loop invariant.  If so, we can
        // unswitch on it if we desire.
        Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
                                               Changed, MSSAU.get())
                              .first;
        if (LoopCond && !equalityPropUnSafe(*LoopCond) &&
            unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
          ++NumBranches;
          return true;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *SC = SI->getCondition();
      Value *LoopCond;
      OperatorChain OpChain;
      std::tie(LoopCond, OpChain) =
          findLIVLoopCondition(SC, CurrentLoop, Changed, MSSAU.get());

      unsigned NumCases = SI->getNumCases();
      if (LoopCond && NumCases) {
        // Find a value to unswitch on:
        // FIXME: this should choose the most expensive case!
        // FIXME: scan for a case with a non-critical edge?
        Constant *UnswitchVal = nullptr;
        // Find a case value such that at least one case value is unswitched
        // out.
        if (OpChain == OC_OpChainAnd) {
          // If the chain only has ANDs and the switch has a case value of 0.
          // Dropping in a 0 to the chain will unswitch out the 0-casevalue.
          auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType()));
          if (BranchesInfo.isUnswitched(SI, AllZero))
            continue;
          // We are unswitching 0 out.
          UnswitchVal = AllZero;
        } else if (OpChain == OC_OpChainOr) {
          // If the chain only has ORs and the switch has a case value of ~0.
          // Dropping in a ~0 to the chain will unswitch out the ~0-casevalue.
          auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
          if (BranchesInfo.isUnswitched(SI, AllOne))
            continue;
          // We are unswitching ~0 out.
          UnswitchVal = AllOne;
        } else {
          assert(OpChain == OC_OpChainNone &&
                 "Expect to unswitch on trivial chain");
          // Do not process the same value again and again.
          // At this point we have some cases already unswitched and
          // some not yet unswitched. Let's find the first not yet unswitched
          // one.
          for (auto Case : SI->cases()) {
            Constant *UnswitchValCandidate = Case.getCaseValue();
            if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
              UnswitchVal = UnswitchValCandidate;
              break;
            }
          }
        }

        if (!UnswitchVal)
          continue;

        if (unswitchIfProfitable(LoopCond, UnswitchVal)) {
          ++NumSwitches;
          // In case of a full LIV, UnswitchVal is the value we unswitched out.
          // In case of a partial LIV, we only unswitch when it's an AND-chain
          // or OR-chain. In both cases the switch input value simplifies to
          // UnswitchVal.
          BranchesInfo.setUnswitched(SI, UnswitchVal);
          return true;
        }
      }
    }

    // Scan the instructions to check for unswitchable values.
    for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
         BBI != E; ++BBI)
      if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
        Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
                                               Changed, MSSAU.get())
                              .first;
        if (LoopCond &&
            unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
          ++NumSelects;
          return true;
        }
      }
  }

  // Check if there is a header condition that is invariant along the path from
  // either the true or false successors to the header. This allows unswitching
  // conditions depending on memory accesses, if there's a path not clobbering
  // the memory locations. Check if this transform has been disabled using
  // metadata, to avoid unswitching the same loop multiple times.
  if (MSSA &&
      !findOptionMDForLoop(CurrentLoop, "llvm.loop.unswitch.partial.disable")) {
    if (auto Info =
            hasPartialIVCondition(*CurrentLoop, MSSAThreshold, *MSSA, *AA)) {
      assert(!Info->InstToDuplicate.empty() &&
             "need at least a partially invariant condition");
      LLVM_DEBUG(dbgs() << "loop-unswitch: Found partially invariant condition "
                        << *Info->InstToDuplicate[0] << "\n");

      Instruction *TI = CurrentLoop->getHeader()->getTerminator();
      Value *LoopCond = Info->InstToDuplicate[0];

      // If the partially unswitched path is a no-op and has a single exit
      // block, we do not need to do full unswitching. Instead, we can directly
      // branch to the exit.
      // TODO: Instead of duplicating the checks, we could also just directly
      // branch to the exit from the conditional branch in the loop.
      if (Info->PathIsNoop) {
        if (HasBranchDivergence &&
            getAnalysis<LegacyDivergenceAnalysis>().isDivergent(LoopCond)) {
          LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
                            << CurrentLoop->getHeader()->getName()
                            << " at non-trivial condition '"
                            << *Info->KnownValue << "' == " << *LoopCond << "\n"
                            << ". Condition is divergent.\n");
          return false;
        }

        ++NumBranches;

        BasicBlock *TrueDest = LoopHeader;
        BasicBlock *FalseDest = Info->ExitForPath;
        if (Info->KnownValue->isOneValue())
          std::swap(TrueDest, FalseDest);

        auto *OldBr =
            cast<BranchInst>(CurrentLoop->getLoopPreheader()->getTerminator());
        emitPreheaderBranchOnCondition(LoopCond, Info->KnownValue, TrueDest,
                                       FalseDest, OldBr, TI,
                                       Info->InstToDuplicate);
        // emitPreheaderBranchOnCondition only unlinks OldBr; free it here.
        delete OldBr;
        RedoLoop = false;
        return true;
      }

      // Otherwise, the path is not a no-op. Run regular unswitching.
      if (unswitchIfProfitable(LoopCond, Info->KnownValue,
                               CurrentLoop->getHeader()->getTerminator(),
                               Info->InstToDuplicate)) {
        ++NumBranches;
        RedoLoop = false;
        return true;
      }
    }
  }

  return Changed;
}

/// Check to see if all paths from BB exit the loop with no side effects
/// (including infinite loops).
///
/// If true, we return true and set ExitBB to the block we
/// exit through.
///
static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
                                         BasicBlock *&ExitBB,
                                         std::set<BasicBlock*> &Visited) {
  if (!Visited.insert(BB).second) {
    // Already visited. Without more analysis, this could indicate an infinite
    // loop.
    return false;
  }
  if (!L->contains(BB)) {
    // Otherwise, this is a loop exit, this is fine so long as this is the
    // first exit.
    if (ExitBB) return false;
    ExitBB = BB;
    return true;
  }

  // Otherwise, this is an unvisited intra-loop node.  Check all successors.
  for (BasicBlock *Succ : successors(BB)) {
    // Check to see if the successor is a trivial loop exit.
    if (!isTrivialLoopExitBlockHelper(L, Succ, ExitBB, Visited))
      return false;
  }

  // Okay, everything after this looks good, check to make sure that this block
  // doesn't include any side effects.
  for (Instruction &I : *BB)
    if (I.mayHaveSideEffects())
      return false;

  return true;
}

/// Return true if the specified block unconditionally leads to an exit from
/// the specified loop, and has no side-effects in the process. If so, return
/// the block that is exited to, otherwise return null.
static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
  std::set<BasicBlock*> Visited;
  Visited.insert(L->getHeader());  // Branches to header make infinite loops.
  BasicBlock *ExitBB = nullptr;
  if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
    return ExitBB;
  return nullptr;
}

/// We have found that we can unswitch CurrentLoop when LoopCond == Val to
/// simplify the loop. If we decide that this is profitable,
/// unswitch the loop, reprocess the pieces, then return true.
bool LoopUnswitch::unswitchIfProfitable(Value *LoopCond, Constant *Val,
                                        Instruction *TI,
                                        ArrayRef<Instruction *> ToDuplicate) {
  // Check to see if it would be profitable to unswitch current loop.
  if (!BranchesInfo.costAllowsUnswitching()) {
    LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
                      << CurrentLoop->getHeader()->getName()
                      << " at non-trivial condition '" << *Val
                      << "' == " << *LoopCond << "\n"
                      << ". Cost too high.\n");
    return false;
  }
  // When the target has branch divergence, refuse to unswitch on a condition
  // the divergence analysis reports as divergent.
  if (HasBranchDivergence &&
      getAnalysis<LegacyDivergenceAnalysis>().isDivergent(LoopCond)) {
    LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
                      << CurrentLoop->getHeader()->getName()
                      << " at non-trivial condition '" << *Val
                      << "' == " << *LoopCond << "\n"
                      << ". Condition is divergent.\n");
    return false;
  }

  unswitchNontrivialCondition(LoopCond, Val, CurrentLoop, TI, ToDuplicate);
  return true;
}

/// Emit a conditional branch on two values: if LIC == Val, branch to TrueDest,
/// otherwise branch to FalseDest. Insert the code immediately before OldBranch
/// and remove (but not erase!) it from the function.
///
/// If ToDuplicate is non-empty, its instructions are cloned in front of
/// OldBranch and the clone of ToDuplicate[0] is used as the branch condition
/// instead of LIC.
void LoopUnswitch::emitPreheaderBranchOnCondition(
    Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest,
    BranchInst *OldBranch, Instruction *TI,
    ArrayRef<Instruction *> ToDuplicate) {
  assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
  assert(TrueDest != FalseDest && "Branch targets should be different");

  // Insert a conditional branch on LIC to the two preheaders.  The original
  // code is the true version and the new code is the false version.
  Value *BranchVal = LIC;
  bool Swapped = false;

  if (!ToDuplicate.empty()) {
    ValueToValueMapTy Old2New;
    // Clone in reverse order so that, when a clone is remapped, the clones of
    // the instructions visited before it are already recorded in Old2New.
    for (Instruction *I : reverse(ToDuplicate)) {
      auto *New = I->clone();
      New->insertBefore(OldBranch);
      RemapInstruction(New, Old2New,
                       RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
      Old2New[I] = New;

      if (MSSAU) {
        MemorySSA *MSSA = MSSAU->getMemorySSA();
        auto *MemA = dyn_cast_or_null<MemoryUse>(MSSA->getMemoryAccess(I));
        if (!MemA)
          continue;

        Loop *L = LI->getLoopFor(I->getParent());
        auto *DefiningAccess = MemA->getDefiningAccess();
        // Get the first defining access before the loop.
        while (L->contains(DefiningAccess->getBlock())) {
          // If the defining access is a MemoryPhi, get the incoming
          // value for the pre-header as defining access.
          if (auto *MemPhi = dyn_cast<MemoryPhi>(DefiningAccess)) {
            DefiningAccess =
                MemPhi->getIncomingValueForBlock(L->getLoopPreheader());
          } else {
            DefiningAccess =
                cast<MemoryDef>(DefiningAccess)->getDefiningAccess();
          }
        }
        MSSAU->createMemoryAccessInBB(New, DefiningAccess, New->getParent(),
                                      MemorySSA::BeforeTerminator);
      }
    }
    BranchVal = Old2New[ToDuplicate[0]];
  } else {

    // Val is not an i1 constant: materialize an explicit `LIC == Val` compare.
    if (!isa<ConstantInt>(Val) ||
        Val->getType() != Type::getInt1Ty(LIC->getContext()))
      BranchVal = new ICmpInst(OldBranch, ICmpInst::ICMP_EQ, LIC, Val);
    else if (Val != ConstantInt::getTrue(Val->getContext())) {
      // We want to enter the new loop when the condition is true.
      std::swap(TrueDest, FalseDest);
      Swapped = true;
    }
  }

  // Old branch will be removed, so save its parent and successor to update the
  // DomTree.
  auto *OldBranchSucc = OldBranch->getSuccessor(0);
  auto *OldBranchParent = OldBranch->getParent();

  // Insert the new branch.
  BranchInst *BI =
      IRBuilder<>(OldBranch).CreateCondBr(BranchVal, TrueDest, FalseDest, TI);
  // The destinations were swapped above, so swap the profile weights as well.
  if (Swapped)
    BI->swapProfMetadata();

  // Remove the old branch so there is only one branch at the end. This is
  // needed to perform DomTree's internal DFS walk on the function's CFG.
  OldBranch->removeFromParent();

  // Inform the DT about the new branch.
  if (DT) {
    // First, add both successors.
    SmallVector<DominatorTree::UpdateType, 3> Updates;
    if (TrueDest != OldBranchSucc)
      Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest});
    if (FalseDest != OldBranchSucc)
      Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest});
    // If both of the new successors are different from the old one, inform the
    // DT that the edge was deleted.
    if (OldBranchSucc != TrueDest && OldBranchSucc != FalseDest) {
      Updates.push_back({DominatorTree::Delete, OldBranchParent, OldBranchSucc});
    }

    if (MSSAU)
      MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true);
    else
      DT->applyUpdates(Updates);
  }

  // If either edge is critical, split it. This helps preserve LoopSimplify
  // form for enclosing loops.
  auto Options =
      CriticalEdgeSplittingOptions(DT, LI, MSSAU.get()).setPreserveLCSSA();
  SplitCriticalEdge(BI, 0, Options);
  SplitCriticalEdge(BI, 1, Options);
}

/// Given a loop that has a trivial unswitchable condition in it (a cond branch
/// from its header block to its latch block, where the path through the loop
/// that doesn't execute its body has no side-effects), unswitch it. This
/// doesn't involve any code duplication, just moving the conditional branch
/// outside of the loop and updating loop info.
void LoopUnswitch::unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
                                            BasicBlock *ExitBlock,
                                            Instruction *TI) {
  LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
                    << LoopHeader->getName() << " [" << L->getBlocks().size()
                    << " blocks] in Function "
                    << L->getHeader()->getParent()->getName()
                    << " on cond: " << *Val << " == " << *Cond << "\n");
  // We are going to make essential changes to CFG. This may invalidate cached
  // information for L or one of its parent loops in SCEV.
  if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
    SEWP->getSE().forgetTopmostLoop(L);

  // First step, split the preheader, so that we know that there is a safe place
  // to insert the conditional branch.  We will change LoopPreheader to have a
  // conditional branch on Cond.
  BasicBlock *NewPH = SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());

  // Now that we have a place to insert the conditional branch, create a place
  // to branch to: this is the exit block out of the loop that we should
  // short-circuit to.

  // Split this block now, so that the loop maintains its exit block, and so
  // that the jump from the preheader can execute the contents of the exit block
  // without actually branching to it (the exit block should be dominated by the
  // loop header, not the preheader).
  assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
  BasicBlock *NewExit =
      SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI, MSSAU.get());

  // Okay, now we have a position to branch from and a position to branch to,
  // insert the new conditional branch.
  auto *OldBranch = dyn_cast<BranchInst>(LoopPreheader->getTerminator());
  assert(OldBranch && "Failed to split the preheader");
  emitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, OldBranch, TI);

  // emitPreheaderBranchOnCondition removed the OldBranch from the function.
  // Delete it, as it is no longer needed.
  delete OldBranch;

  // We need to reprocess this loop, it could be unswitched again.
  RedoLoop = true;

  // Now that we know that the loop is never entered when this condition is a
  // particular value, rewrite the loop with this info.  We know that this will
  // at least eliminate the old branch.
  rewriteLoopBodyWithConditionConstant(L, Cond, Val, /*IsEqual=*/false);

  ++NumTrivial;
}

/// Check if the first non-constant condition starting from the loop header is
/// a trivial unswitch condition: that is, a condition that controls whether or
/// not the loop does anything at all. If it is a trivial condition, unswitching
/// produces no code duplications (equivalently, it produces a simpler loop and
/// a new empty loop, which gets deleted). Therefore always unswitch trivial
/// condition.
bool LoopUnswitch::tryTrivialLoopUnswitch(bool &Changed) {
  BasicBlock *CurrentBB = CurrentLoop->getHeader();
  Instruction *CurrentTerm = CurrentBB->getTerminator();
  LLVMContext &Context = CurrentBB->getContext();

  // If loop header has only one reachable successor (currently via an
  // unconditional branch or constant foldable conditional branch, but
  // should also consider adding constant foldable switch instruction in
  // future), we should keep looking for trivial condition candidates in
  // the successor as well. An alternative is to constant fold conditions
  // and merge successors into loop header (then we only need to check header's
  // terminator). The reason for not doing this in LoopUnswitch pass is that
  // it could potentially break LoopPassManager's invariants. Folding dead
  // branches could either eliminate the current loop or make other loops
  // unreachable. LCSSA form might also not be preserved after deleting
  // branches. The following code keeps traversing loop header's successors
  // until it finds the trivial condition candidate (condition that is not a
  // constant). Since unswitching generates branches with constant conditions,
  // this scenario could be very common in practice.
  SmallPtrSet<BasicBlock*, 8> Visited;

  while (true) {
    // If we exit the loop or reach a previously visited block, then
    // we can not reach any trivial condition candidates (unfoldable
    // branch instructions or switch instructions) and no unswitch
    // can happen. Exit and return false.
    if (!CurrentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second)
      return false;

    // Check if this loop will execute any side-effecting instructions (e.g.
    // stores, calls, volatile loads) in the part of the loop that the code
    // *would* execute.  Check the header first.
    for (Instruction &I : *CurrentBB)
      if (I.mayHaveSideEffects())
        return false;

    if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
      if (BI->isUnconditional()) {
        CurrentBB = BI->getSuccessor(0);
      } else if (BI->getCondition() == ConstantInt::getTrue(Context)) {
        CurrentBB = BI->getSuccessor(0);
      } else if (BI->getCondition() == ConstantInt::getFalse(Context)) {
        CurrentBB = BI->getSuccessor(1);
      } else {
        // Found a trivial condition candidate: non-foldable conditional branch.
        break;
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
      // At this point, any constant-foldable instructions should have probably
      // been folded.
      ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
      if (!Cond)
        break;
      // Find the target block we are definitely going to.
      CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
    } else {
      // We do not understand these terminator instructions.
      break;
    }

    CurrentTerm = CurrentBB->getTerminator();
  }

  // CondVal is the condition that controls the trivial condition.
  // LoopExitBB is the BasicBlock that the loop exits to when it meets the
  // trivial condition.
  Constant *CondVal = nullptr;
  BasicBlock *LoopExitBB = nullptr;

  if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
    // If this isn't branching on an invariant condition, we can't unswitch it.
    if (!BI->isConditional())
      return false;

    Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
                                           Changed, MSSAU.get())
                          .first;

    // Unswitch only if the trivial condition itself is an LIV (not
    // partial LIV which could occur in and/or)
    if (!LoopCond || LoopCond != BI->getCondition())
      return false;

    // Check to see if a successor of the branch is guaranteed to
    // exit through a unique exit block without having any
    // side-effects.  If so, determine the value of Cond that causes
    // it to do this.
    if ((LoopExitBB =
             isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(0)))) {
      CondVal = ConstantInt::getTrue(Context);
    } else if ((LoopExitBB =
                    isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(1)))) {
      CondVal = ConstantInt::getFalse(Context);
    }

    // If we didn't find a single unique LoopExit block, or if the loop exit
    // block contains phi nodes, this isn't trivial.
    if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
      return false;   // Can't handle this.

    if (equalityPropUnSafe(*LoopCond))
      return false;

    unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
                             CurrentTerm);
    ++NumBranches;
    return true;
  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
    // If this isn't switching on an invariant condition, we can't unswitch it.
    Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
                                           Changed, MSSAU.get())
                          .first;

    // Unswitch only if the trivial condition itself is an LIV (not
    // partial LIV which could occur in and/or)
    if (!LoopCond || LoopCond != SI->getCondition())
      return false;

    // Check to see if a successor of the switch is guaranteed to go to the
    // latch block or exit through a one exit block without having any
    // side-effects.  If so, determine the value of Cond that causes it to do
    // this.
    // Note that we can't trivially unswitch on the default case or
    // on already unswitched cases.
    for (auto Case : SI->cases()) {
      BasicBlock *LoopExitCandidate;
      if ((LoopExitCandidate =
               isTrivialLoopExitBlock(CurrentLoop, Case.getCaseSuccessor()))) {
        // Okay, we found a trivial case, remember the value that is trivial.
        ConstantInt *CaseVal = Case.getCaseValue();

        // Check that it was not unswitched before, since already unswitched
        // trivial values look trivial too.
        if (BranchesInfo.isUnswitched(SI, CaseVal))
          continue;
        LoopExitBB = LoopExitCandidate;
        CondVal = CaseVal;
        break;
      }
    }

    // If we didn't find a single unique LoopExit block, or if the loop exit
    // block contains phi nodes, this isn't trivial.
    if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
      return false;   // Can't handle this.

    unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
                             nullptr);

    // We are only unswitching full LIV.
    BranchesInfo.setUnswitched(SI, CondVal);
    ++NumSwitches;
    return true;
  }
  return false;
}

/// Split all of the edges from inside the loop to their exit blocks.
/// Update the appropriate Phi nodes as we do so.
void LoopUnswitch::splitExitEdges(
    Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks) {

  for (unsigned I = 0, E = ExitBlocks.size(); I != E; ++I) {
    BasicBlock *ExitBlock = ExitBlocks[I];
    // Snapshot the predecessors before splitting them.
    SmallVector<BasicBlock *, 4> Preds(predecessors(ExitBlock));

    // Although SplitBlockPredecessors doesn't preserve loop-simplify in
    // general, if we call it on all predecessors of all exits then it does.
    SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI, MSSAU.get(),
                           /*PreserveLCSSA*/ true);
  }
}

/// We determined that the loop is profitable to unswitch when LIC equals Val.
/// Split it into loop versions and test the condition outside of either loop.
/// Return the loops created as Out1/Out2.
-void LoopUnswitch::unswitchNontrivialCondition( - Value *LIC, Constant *Val, Loop *L, Instruction *TI, - ArrayRef<Instruction *> ToDuplicate) { - Function *F = LoopHeader->getParent(); - LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %" - << LoopHeader->getName() << " [" << L->getBlocks().size() - << " blocks] in Function " << F->getName() << " when '" - << *Val << "' == " << *LIC << "\n"); - - // We are going to make essential changes to CFG. This may invalidate cached - // information for L or one of its parent loops in SCEV. - if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>()) - SEWP->getSE().forgetTopmostLoop(L); - - LoopBlocks.clear(); - NewBlocks.clear(); - - if (MSSAU && VerifyMemorySSA) - MSSA->verifyMemorySSA(); - - // First step, split the preheader and exit blocks, and add these blocks to - // the LoopBlocks list. - BasicBlock *NewPreheader = - SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get()); - LoopBlocks.push_back(NewPreheader); - - // We want the loop to come after the preheader, but before the exit blocks. - llvm::append_range(LoopBlocks, L->blocks()); - - SmallVector<BasicBlock*, 8> ExitBlocks; - L->getUniqueExitBlocks(ExitBlocks); - - // Split all of the edges from inside the loop to their exit blocks. Update - // the appropriate Phi nodes as we do so. - splitExitEdges(L, ExitBlocks); - - // The exit blocks may have been changed due to edge splitting, recompute. - ExitBlocks.clear(); - L->getUniqueExitBlocks(ExitBlocks); - - // Add exit blocks to the loop blocks. - llvm::append_range(LoopBlocks, ExitBlocks); - - // Next step, clone all of the basic blocks that make up the loop (including - // the loop preheader and exit blocks), keeping track of the mapping between - // the instructions and blocks. 
- NewBlocks.reserve(LoopBlocks.size()); - ValueToValueMapTy VMap; - for (unsigned I = 0, E = LoopBlocks.size(); I != E; ++I) { - BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[I], VMap, ".us", F); - - NewBlocks.push_back(NewBB); - VMap[LoopBlocks[I]] = NewBB; // Keep the BB mapping. - } - - // Splice the newly inserted blocks into the function right before the - // original preheader. - F->getBasicBlockList().splice(NewPreheader->getIterator(), - F->getBasicBlockList(), - NewBlocks[0]->getIterator(), F->end()); - - // Now we create the new Loop object for the versioned loop. - Loop *NewLoop = cloneLoop(L, L->getParentLoop(), VMap, LI, LPM); - - // Recalculate unswitching quota, inherit simplified switches info for NewBB, - // Probably clone more loop-unswitch related loop properties. - BranchesInfo.cloneData(NewLoop, L, VMap); - - Loop *ParentLoop = L->getParentLoop(); - if (ParentLoop) { - // Make sure to add the cloned preheader and exit blocks to the parent loop - // as well. - ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI); - } - - for (unsigned EBI = 0, EBE = ExitBlocks.size(); EBI != EBE; ++EBI) { - BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[EBI]]); - // The new exit block should be in the same loop as the old one. - if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[EBI])) - ExitBBLoop->addBasicBlockToLoop(NewExit, *LI); - - assert(NewExit->getTerminator()->getNumSuccessors() == 1 && - "Exit block should have been split to have one successor!"); - BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0); - - // If the successor of the exit block had PHI nodes, add an entry for - // NewExit. 
- for (PHINode &PN : ExitSucc->phis()) { - Value *V = PN.getIncomingValueForBlock(ExitBlocks[EBI]); - ValueToValueMapTy::iterator It = VMap.find(V); - if (It != VMap.end()) V = It->second; - PN.addIncoming(V, NewExit); - } - - if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { - PHINode *PN = PHINode::Create(LPad->getType(), 0, "", - &*ExitSucc->getFirstInsertionPt()); - - for (BasicBlock *BB : predecessors(ExitSucc)) { - LandingPadInst *LPI = BB->getLandingPadInst(); - LPI->replaceAllUsesWith(PN); - PN->addIncoming(LPI, BB); - } - } - } - - // Rewrite the code to refer to itself. - for (unsigned NBI = 0, NBE = NewBlocks.size(); NBI != NBE; ++NBI) { - for (Instruction &I : *NewBlocks[NBI]) { - RemapInstruction(&I, VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - if (auto *II = dyn_cast<AssumeInst>(&I)) - AC->registerAssumption(II); - } - } - - // Rewrite the original preheader to select between versions of the loop. - BranchInst *OldBR = cast<BranchInst>(LoopPreheader->getTerminator()); - assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] && - "Preheader splitting did not work correctly!"); - - if (MSSAU) { - // Update MemorySSA after cloning, and before splitting to unreachables, - // since that invalidates the 1:1 mapping of clones in VMap. - LoopBlocksRPO LBRPO(L); - LBRPO.perform(LI); - MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, VMap); - } - - // Emit the new branch that selects between the two versions of this loop. - emitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR, - TI, ToDuplicate); - if (MSSAU) { - // Update MemoryPhis in Exit blocks. - MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMap, *DT); - if (VerifyMemorySSA) - MSSA->verifyMemorySSA(); - } - - // The OldBr was replaced by a new one and removed (but not erased) by - // emitPreheaderBranchOnCondition. It is no longer needed, so delete it. 
- delete OldBR; - - LoopProcessWorklist.push_back(NewLoop); - RedoLoop = true; - - // Keep a WeakTrackingVH holding onto LIC. If the first call to - // RewriteLoopBody - // deletes the instruction (for example by simplifying a PHI that feeds into - // the condition that we're unswitching on), we don't rewrite the second - // iteration. - WeakTrackingVH LICHandle(LIC); - - if (ToDuplicate.empty()) { - // Now we rewrite the original code to know that the condition is true and - // the new code to know that the condition is false. - rewriteLoopBodyWithConditionConstant(L, LIC, Val, /*IsEqual=*/false); - - // It's possible that simplifying one loop could cause the other to be - // changed to another value or a constant. If its a constant, don't - // simplify it. - if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop && - LICHandle && !isa<Constant>(LICHandle)) - rewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, - /*IsEqual=*/true); - } else { - // Partial unswitching. Update the condition in the right loop with the - // constant. - auto *CC = cast<ConstantInt>(Val); - if (CC->isOneValue()) { - rewriteLoopBodyWithConditionConstant(NewLoop, VMap[LIC], Val, - /*IsEqual=*/true); - } else - rewriteLoopBodyWithConditionConstant(L, LIC, Val, /*IsEqual=*/true); - - // Mark the new loop as partially unswitched, to avoid unswitching on the - // same condition again. - auto &Context = NewLoop->getHeader()->getContext(); - MDNode *DisableUnswitchMD = MDNode::get( - Context, MDString::get(Context, "llvm.loop.unswitch.partial.disable")); - MDNode *NewLoopID = makePostTransformationMetadata( - Context, L->getLoopID(), {"llvm.loop.unswitch.partial"}, - {DisableUnswitchMD}); - NewLoop->setLoopID(NewLoopID); - } - - if (MSSA && VerifyMemorySSA) - MSSA->verifyMemorySSA(); -} - -/// Remove all instances of I from the worklist vector specified. 
-static void removeFromWorklist(Instruction *I, - std::vector<Instruction *> &Worklist) { - llvm::erase_value(Worklist, I); -} - -/// When we find that I really equals V, remove I from the -/// program, replacing all uses with V and update the worklist. -static void replaceUsesOfWith(Instruction *I, Value *V, - std::vector<Instruction *> &Worklist, Loop *L, - LPPassManager *LPM, MemorySSAUpdater *MSSAU) { - LLVM_DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n"); - - // Add uses to the worklist, which may be dead now. - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) - if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i))) - Worklist.push_back(Use); - - // Add users to the worklist which may be simplified now. - for (User *U : I->users()) - Worklist.push_back(cast<Instruction>(U)); - removeFromWorklist(I, Worklist); - I->replaceAllUsesWith(V); - if (!I->mayHaveSideEffects()) { - if (MSSAU) - MSSAU->removeMemoryAccess(I); - I->eraseFromParent(); - } - ++NumSimplify; -} - -/// We know either that the value LIC has the value specified by Val in the -/// specified loop, or we know it does NOT have that value. -/// Rewrite any uses of LIC or of properties correlated to it. -void LoopUnswitch::rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, - Constant *Val, - bool IsEqual) { - assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?"); - - // FIXME: Support correlated properties, like: - // for (...) - // if (li1 < li2) - // ... - // if (li1 > li2) - // ... - - // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches, - // selects, switches. - std::vector<Instruction*> Worklist; - LLVMContext &Context = Val->getContext(); - - // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC - // in the loop with the appropriate one directly. 
- if (IsEqual || (isa<ConstantInt>(Val) && - Val->getType()->isIntegerTy(1))) { - Value *Replacement; - if (IsEqual) - Replacement = Val; - else - Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), - !cast<ConstantInt>(Val)->getZExtValue()); - - for (User *U : LIC->users()) { - Instruction *UI = dyn_cast<Instruction>(U); - if (!UI || !L->contains(UI)) - continue; - Worklist.push_back(UI); - } - - for (Instruction *UI : Worklist) - UI->replaceUsesOfWith(LIC, Replacement); - - simplifyCode(Worklist, L); - return; - } - - // Otherwise, we don't know the precise value of LIC, but we do know that it - // is certainly NOT "Val". As such, simplify any uses in the loop that we - // can. This case occurs when we unswitch switch statements. - for (User *U : LIC->users()) { - Instruction *UI = dyn_cast<Instruction>(U); - if (!UI || !L->contains(UI)) - continue; - - // At this point, we know LIC is definitely not Val. Try to use some simple - // logic to simplify the user w.r.t. to the context. - if (Value *Replacement = simplifyInstructionWithNotEqual(UI, LIC, Val)) { - if (LI->replacementPreservesLCSSAForm(UI, Replacement)) { - // This in-loop instruction has been simplified w.r.t. its context, - // i.e. LIC != Val, make sure we propagate its replacement value to - // all its users. - // - // We can not yet delete UI, the LIC user, yet, because that would invalidate - // the LIC->users() iterator !. However, we can make this instruction - // dead by replacing all its users and push it onto the worklist so that - // it can be properly deleted and its operands simplified. - UI->replaceAllUsesWith(Replacement); - } - } - - // This is a LIC user, push it into the worklist so that simplifyCode can - // attempt to simplify it. - Worklist.push_back(UI); - - // If we know that LIC is not Val, use this info to simplify code. 
- SwitchInst *SI = dyn_cast<SwitchInst>(UI); - if (!SI || !isa<ConstantInt>(Val)) continue; - - // NOTE: if a case value for the switch is unswitched out, we record it - // after the unswitch finishes. We can not record it here as the switch - // is not a direct user of the partial LIV. - SwitchInst::CaseHandle DeadCase = - *SI->findCaseValue(cast<ConstantInt>(Val)); - // Default case is live for multiple values. - if (DeadCase == *SI->case_default()) - continue; - - // Found a dead case value. Don't remove PHI nodes in the - // successor if they become single-entry, those PHI nodes may - // be in the Users list. - - BasicBlock *Switch = SI->getParent(); - BasicBlock *SISucc = DeadCase.getCaseSuccessor(); - BasicBlock *Latch = L->getLoopLatch(); - - if (!SI->findCaseDest(SISucc)) continue; // Edge is critical. - // If the DeadCase successor dominates the loop latch, then the - // transformation isn't safe since it will delete the sole predecessor edge - // to the latch. - if (Latch && DT->dominates(SISucc, Latch)) - continue; - - // FIXME: This is a hack. We need to keep the successor around - // and hooked up so as to preserve the loop structure, because - // trying to update it is complicated. So instead we preserve the - // loop structure and put the block on a dead code path. - SplitEdge(Switch, SISucc, DT, LI, MSSAU.get()); - // Compute the successors instead of relying on the return value - // of SplitEdge, since it may have split the switch successor - // after PHI nodes. - BasicBlock *NewSISucc = DeadCase.getCaseSuccessor(); - BasicBlock *OldSISucc = *succ_begin(NewSISucc); - // Create an "unreachable" destination. - BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable", - Switch->getParent(), - OldSISucc); - new UnreachableInst(Context, Abort); - // Force the new case destination to branch to the "unreachable" - // block while maintaining a (dead) CFG edge to the old block. 
- NewSISucc->getTerminator()->eraseFromParent(); - BranchInst::Create(Abort, OldSISucc, - ConstantInt::getTrue(Context), NewSISucc); - // Release the PHI operands for this edge. - for (PHINode &PN : NewSISucc->phis()) - PN.setIncomingValueForBlock(Switch, UndefValue::get(PN.getType())); - // Tell the domtree about the new block. We don't fully update the - // domtree here -- instead we force it to do a full recomputation - // after the pass is complete -- but we do need to inform it of - // new blocks. - DT->addNewBlock(Abort, NewSISucc); - } - - simplifyCode(Worklist, L); -} - -/// Now that we have simplified some instructions in the loop, walk over it and -/// constant prop, dce, and fold control flow where possible. Note that this is -/// effectively a very simple loop-structure-aware optimizer. During processing -/// of this loop, L could very well be deleted, so it must not be used. -/// -/// FIXME: When the loop optimizer is more mature, separate this out to a new -/// pass. -/// -void LoopUnswitch::simplifyCode(std::vector<Instruction *> &Worklist, Loop *L) { - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - while (!Worklist.empty()) { - Instruction *I = Worklist.back(); - Worklist.pop_back(); - - // Simple DCE. - if (isInstructionTriviallyDead(I)) { - LLVM_DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n"); - - // Add uses to the worklist, which may be dead now. - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) - if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i))) - Worklist.push_back(Use); - removeFromWorklist(I, Worklist); - if (MSSAU) - MSSAU->removeMemoryAccess(I); - I->eraseFromParent(); - ++NumSimplify; - continue; - } - - // See if instruction simplification can hack this up. This is common for - // things like "select false, X, Y" after unswitching made the condition be - // 'false'. TODO: update the domtree properly so we can pass it here. 
- if (Value *V = SimplifyInstruction(I, DL)) - if (LI->replacementPreservesLCSSAForm(I, V)) { - replaceUsesOfWith(I, V, Worklist, L, LPM, MSSAU.get()); - continue; - } - - // Special case hacks that appear commonly in unswitched code. - if (BranchInst *BI = dyn_cast<BranchInst>(I)) { - if (BI->isUnconditional()) { - // If BI's parent is the only pred of the successor, fold the two blocks - // together. - BasicBlock *Pred = BI->getParent(); - (void)Pred; - BasicBlock *Succ = BI->getSuccessor(0); - BasicBlock *SinglePred = Succ->getSinglePredecessor(); - if (!SinglePred) continue; // Nothing to do. - assert(SinglePred == Pred && "CFG broken"); - - // Make the LPM and Worklist updates specific to LoopUnswitch. - removeFromWorklist(BI, Worklist); - auto SuccIt = Succ->begin(); - while (PHINode *PN = dyn_cast<PHINode>(SuccIt++)) { - for (unsigned It = 0, E = PN->getNumOperands(); It != E; ++It) - if (Instruction *Use = dyn_cast<Instruction>(PN->getOperand(It))) - Worklist.push_back(Use); - for (User *U : PN->users()) - Worklist.push_back(cast<Instruction>(U)); - removeFromWorklist(PN, Worklist); - ++NumSimplify; - } - // Merge the block and make the remaining analyses updates. - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - MergeBlockIntoPredecessor(Succ, &DTU, LI, MSSAU.get()); - ++NumSimplify; - continue; - } - - continue; - } - } -} - -/// Simple simplifications we can do given the information that Cond is -/// definitely not equal to Val. 
-Value *LoopUnswitch::simplifyInstructionWithNotEqual(Instruction *Inst, - Value *Invariant, - Constant *Val) { - // icmp eq cond, val -> false - ICmpInst *CI = dyn_cast<ICmpInst>(Inst); - if (CI && CI->isEquality()) { - Value *Op0 = CI->getOperand(0); - Value *Op1 = CI->getOperand(1); - if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) { - LLVMContext &Ctx = Inst->getContext(); - if (CI->getPredicate() == CmpInst::ICMP_EQ) - return ConstantInt::getFalse(Ctx); - else - return ConstantInt::getTrue(Ctx); - } - } - - // FIXME: there may be other opportunities, e.g. comparison with floating - // point, or Invariant - Val != 0, etc. - return nullptr; -} diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 2ff1e8480749..c733aa4701ed 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -70,14 +70,12 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" -#include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/llvm/lib/Transforms/Scalar/LowerAtomicPass.cpp index 4063e4fe0472..6aba913005d0 100644 --- a/llvm/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerAtomicPass.cpp @@ -11,95 +11,17 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar/LowerAtomic.h" +#include "llvm/Transforms/Scalar/LowerAtomicPass.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/InitializePasses.h" #include 
"llvm/Pass.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/LowerAtomic.h" using namespace llvm; #define DEBUG_TYPE "loweratomic" -static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { - IRBuilder<> Builder(CXI); - Value *Ptr = CXI->getPointerOperand(); - Value *Cmp = CXI->getCompareOperand(); - Value *Val = CXI->getNewValOperand(); - - LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr); - Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); - Value *Res = Builder.CreateSelect(Equal, Val, Orig); - Builder.CreateStore(Res, Ptr); - - Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0); - Res = Builder.CreateInsertValue(Res, Equal, 1); - - CXI->replaceAllUsesWith(Res); - CXI->eraseFromParent(); - return true; -} - -bool llvm::lowerAtomicRMWInst(AtomicRMWInst *RMWI) { - IRBuilder<> Builder(RMWI); - Value *Ptr = RMWI->getPointerOperand(); - Value *Val = RMWI->getValOperand(); - - LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr); - Value *Res = nullptr; - - switch (RMWI->getOperation()) { - default: llvm_unreachable("Unexpected RMW operation"); - case AtomicRMWInst::Xchg: - Res = Val; - break; - case AtomicRMWInst::Add: - Res = Builder.CreateAdd(Orig, Val); - break; - case AtomicRMWInst::Sub: - Res = Builder.CreateSub(Orig, Val); - break; - case AtomicRMWInst::And: - Res = Builder.CreateAnd(Orig, Val); - break; - case AtomicRMWInst::Nand: - Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val)); - break; - case AtomicRMWInst::Or: - Res = Builder.CreateOr(Orig, Val); - break; - case AtomicRMWInst::Xor: - Res = Builder.CreateXor(Orig, Val); - break; - case AtomicRMWInst::Max: - Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val), - Val, Orig); - break; - case AtomicRMWInst::Min: - Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val), - Orig, Val); - break; - case AtomicRMWInst::UMax: - Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val), - Val, Orig); - break; - case AtomicRMWInst::UMin: - 
Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val), - Orig, Val); - break; - case AtomicRMWInst::FAdd: - Res = Builder.CreateFAdd(Orig, Val); - break; - case AtomicRMWInst::FSub: - Res = Builder.CreateFSub(Orig, Val); - break; - } - Builder.CreateStore(Res, Ptr); - RMWI->replaceAllUsesWith(Orig); - RMWI->eraseFromParent(); - return true; -} - static bool LowerFenceInst(FenceInst *FI) { FI->eraseFromParent(); return true; @@ -121,7 +43,7 @@ static bool runOnBasicBlock(BasicBlock &BB) { if (FenceInst *FI = dyn_cast<FenceInst>(&Inst)) Changed |= LowerFenceInst(FI); else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&Inst)) - Changed |= LowerAtomicCmpXchgInst(CXI); + Changed |= lowerAtomicCmpXchgInst(CXI); else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&Inst)) Changed |= lowerAtomicRMWInst(RMWI); else if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) { diff --git a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp index 186065db327e..47493b54a527 100644 --- a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp @@ -26,11 +26,9 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" @@ -96,7 +94,7 @@ static bool replaceConditionalBranchesOnConstant(Instruction *II, return HasDeadBlocks; } -static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI, +static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo &TLI, DominatorTree *DT) { Optional<DomTreeUpdater> DTU; if (DT) @@ -140,21 +138,21 @@ static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI, IsConstantIntrinsicsHandled++; break; case 
Intrinsic::objectsize: - NewValue = lowerObjectSizeCall(II, DL, TLI, true); + NewValue = lowerObjectSizeCall(II, DL, &TLI, true); ObjectSizeIntrinsicsHandled++; break; } HasDeadBlocks |= replaceConditionalBranchesOnConstant( - II, NewValue, DTU.hasValue() ? DTU.getPointer() : nullptr); + II, NewValue, DTU ? DTU.getPointer() : nullptr); } if (HasDeadBlocks) - removeUnreachableBlocks(F, DTU.hasValue() ? DTU.getPointer() : nullptr); + removeUnreachableBlocks(F, DTU ? DTU.getPointer() : nullptr); return !Worklist.empty(); } PreservedAnalyses LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) { - if (lowerConstantIntrinsics(F, AM.getCachedResult<TargetLibraryAnalysis>(F), + if (lowerConstantIntrinsics(F, AM.getResult<TargetLibraryAnalysis>(F), AM.getCachedResult<DominatorTreeAnalysis>(F))) { PreservedAnalyses PA; PA.preserve<DominatorTreeAnalysis>(); @@ -178,8 +176,8 @@ public: } bool runOnFunction(Function &F) override { - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - const TargetLibraryInfo *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); DominatorTree *DT = nullptr; if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>()) DT = &DTWP->getDomTree(); @@ -187,6 +185,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); } @@ -196,6 +195,7 @@ public: char LowerConstantIntrinsics::ID = 0; INITIALIZE_PASS_BEGIN(LowerConstantIntrinsics, "lower-constant-intrinsics", "Lower constant intrinsics", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(LowerConstantIntrinsics, "lower-constant-intrinsics", "Lower constant intrinsics", false, false) diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index a7eb60b5e032..88fad9896c59 100644 --- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -21,12 +21,11 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/IR/Metadata.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/MisExpect.h" using namespace llvm; @@ -101,6 +100,8 @@ static bool handleSwitchExpect(SwitchInst &SI) { uint64_t Index = (Case == *SI.case_default()) ? 
0 : Case.getCaseIndex() + 1; Weights[Index] = LikelyBranchWeightVal; + misexpect::checkExpectAnnotations(SI, Weights, /*IsFrontend=*/true); + SI.setCondition(ArgValue); SI.setMetadata(LLVMContext::MD_prof, @@ -315,13 +316,16 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) = getBranchWeight(Fn->getIntrinsicID(), CI, 2); + SmallVector<uint32_t, 4> ExpectedWeights; if ((ExpectedValue->getZExtValue() == ValueComparedTo) == (Predicate == CmpInst::ICMP_EQ)) { Node = MDB.createBranchWeights(LikelyBranchWeightVal, UnlikelyBranchWeightVal); + ExpectedWeights = {LikelyBranchWeightVal, UnlikelyBranchWeightVal}; } else { Node = MDB.createBranchWeights(UnlikelyBranchWeightVal, LikelyBranchWeightVal); + ExpectedWeights = {UnlikelyBranchWeightVal, LikelyBranchWeightVal}; } if (CmpI) @@ -329,6 +333,8 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { else BSI.setCondition(ArgValue); + misexpect::checkFrontendInstrumentation(BSI, ExpectedWeights); + BSI.setMetadata(LLVMContext::MD_prof, Node); return true; @@ -409,7 +415,7 @@ public: bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); } }; -} +} // namespace char LowerExpectIntrinsic::ID = 0; INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect", diff --git a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp index 45f5929e3b90..8dc037b10cc8 100644 --- a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp @@ -15,7 +15,6 @@ #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/GuardUtils.h" -#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -49,9 +48,13 @@ static bool lowerGuardIntrinsic(Function &F) { return false; SmallVector<CallInst *, 8> ToLower; 
- for (auto &I : instructions(F)) - if (isGuard(&I)) - ToLower.push_back(cast<CallInst>(&I)); + // Traverse through the users of GuardDecl. + // This is presumably cheaper than traversing all instructions in the + // function. + for (auto *U : GuardDecl->users()) + if (auto *CI = dyn_cast<CallInst>(U)) + if (CI->getFunction() == &F) + ToLower.push_back(CI); if (ToLower.empty()) return false; diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 296becb31e8f..c05906649f16 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -18,11 +18,11 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" -#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -704,10 +704,10 @@ public: // We may remove II. By default continue on the next/prev instruction. ++II; // If we were to erase II, move again. - auto EraseFromParent = [&II](Value *V) { + auto EraseFromParent = [&II, &BB](Value *V) { auto *Inst = cast<Instruction>(V); if (Inst->use_empty()) { - if (Inst == &*II) { + if (II != BB.rend() && Inst == &*II) { ++II; } Inst->eraseFromParent(); @@ -718,7 +718,7 @@ public: Instruction *NewInst = nullptr; IRBuilder<> IB(&I); - MatrixBuilder<IRBuilder<>> Builder(IB); + MatrixBuilder Builder(IB); Value *TA, *TAMA, *TAMB; ConstantInt *R, *K, *C; @@ -766,28 +766,25 @@ public: // If we have a TT matmul, lift the transpose. We may be able to fold into // consuming multiply. 
for (BasicBlock &BB : Func) { - for (BasicBlock::iterator II = BB.begin(); II != BB.end();) { - Instruction *I = &*II; - // We may remove I. - ++II; + for (Instruction &I : llvm::make_early_inc_range(BB)) { Value *A, *B, *AT, *BT; ConstantInt *R, *K, *C; // A^t * B ^t -> (B * A)^t - if (match(&*I, m_Intrinsic<Intrinsic::matrix_multiply>( - m_Value(A), m_Value(B), m_ConstantInt(R), - m_ConstantInt(K), m_ConstantInt(C))) && + if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>( + m_Value(A), m_Value(B), m_ConstantInt(R), + m_ConstantInt(K), m_ConstantInt(C))) && match(A, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(AT))) && match(B, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value((BT))))) { - IRBuilder<> IB(&*I); - MatrixBuilder<IRBuilder<>> Builder(IB); + IRBuilder<> IB(&I); + MatrixBuilder Builder(IB); Value *M = Builder.CreateMatrixMultiply( BT, AT, C->getZExtValue(), K->getZExtValue(), R->getZExtValue()); setShapeInfo(M, {C, R}); Instruction *NewInst = Builder.CreateMatrixTranspose( M, C->getZExtValue(), R->getZExtValue()); - ReplaceAllUsesWith(*I, NewInst); - if (I->use_empty()) - I->eraseFromParent(); + ReplaceAllUsesWith(I, NewInst); + if (I.use_empty()) + I.eraseFromParent(); if (A->use_empty()) cast<Instruction>(A)->eraseFromParent(); if (A != B && B->use_empty()) @@ -891,27 +888,27 @@ public: // having to update as many def-use and use-def chains. // // Because we add to ToRemove during fusion we can't guarantee that defs - // are before uses. Change uses to undef temporarily as these should get + // are before uses. Change uses to poison temporarily as these should get // removed as well. // - // For verification, we keep track of where we changed uses to undefs in - // UndefedInsts and then check that we in fact remove them. - SmallSet<Instruction *, 16> UndefedInsts; + // For verification, we keep track of where we changed uses to poison in + // PoisonedInsts and then check that we in fact remove them. 
+ SmallSet<Instruction *, 16> PoisonedInsts; for (auto *Inst : reverse(ToRemove)) { for (Use &U : llvm::make_early_inc_range(Inst->uses())) { - if (auto *Undefed = dyn_cast<Instruction>(U.getUser())) - UndefedInsts.insert(Undefed); - U.set(UndefValue::get(Inst->getType())); + if (auto *Poisoned = dyn_cast<Instruction>(U.getUser())) + PoisonedInsts.insert(Poisoned); + U.set(PoisonValue::get(Inst->getType())); } Inst->eraseFromParent(); - UndefedInsts.erase(Inst); + PoisonedInsts.erase(Inst); } - if (!UndefedInsts.empty()) { - // If we didn't remove all undefed instructions, it's a hard error. - dbgs() << "Undefed but present instructions:\n"; - for (auto *I : UndefedInsts) + if (!PoisonedInsts.empty()) { + // If we didn't remove all poisoned instructions, it's a hard error. + dbgs() << "Poisoned but present instructions:\n"; + for (auto *I : PoisonedInsts) dbgs() << *I << "\n"; - llvm_unreachable("Undefed but instruction not removed"); + llvm_unreachable("Poisoned but instruction not removed"); } return Changed; @@ -1670,7 +1667,7 @@ public: for (unsigned I = 0; I < NewNumVecs; ++I) { // Build a single result vector. First initialize it. - Value *ResultVector = UndefValue::get( + Value *ResultVector = PoisonValue::get( FixedVectorType::get(VectorTy->getElementType(), NewNumElts)); // Go through the old elements and insert it into the resulting vector. 
for (auto J : enumerate(InputMatrix.vectors())) { diff --git a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp index 73b2cd06fa23..e2de322933bc 100644 --- a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp +++ b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp @@ -13,8 +13,6 @@ #include "llvm/Transforms/Scalar/LowerWidenableCondition.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/GuardUtils.h" -#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -24,7 +22,6 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/GuardUtils.h" using namespace llvm; @@ -50,9 +47,13 @@ static bool lowerWidenableCondition(Function &F) { using namespace llvm::PatternMatch; SmallVector<CallInst *, 8> ToLower; - for (auto &I : instructions(F)) - if (match(&I, m_Intrinsic<Intrinsic::experimental_widenable_condition>())) - ToLower.push_back(cast<CallInst>(&I)); + // Traverse through the users of WCDecl. + // This is presumably cheaper than traversing all instructions in the + // function. 
+ for (auto *U : WCDecl->users()) + if (auto *CI = dyn_cast<CallInst>(U)) + if (CI->getFunction() == &F) + ToLower.push_back(CI); if (ToLower.empty()) return false; diff --git a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp index 5ffae128f5f0..a3f09a5a33c3 100644 --- a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp +++ b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -33,13 +33,11 @@ #include "llvm/Transforms/Scalar/MakeGuardsExplicit.h" #include "llvm/Analysis/GuardUtils.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 6698db26626b..1f5bc69acecd 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -28,14 +28,12 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" @@ -45,7 +43,6 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" @@ -61,15 +58,13 @@ #include <algorithm> #include <cassert> #include <cstdint> -#include <utility> using namespace llvm; 
#define DEBUG_TYPE "memcpyopt" static cl::opt<bool> EnableMemCpyOptWithoutLibcalls( - "enable-memcpyopt-without-libcalls", cl::init(false), cl::Hidden, - cl::ZeroOrMore, + "enable-memcpyopt-without-libcalls", cl::Hidden, cl::desc("Enable memcpyopt even when libcalls are disabled")); STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted"); @@ -100,7 +95,7 @@ struct MemsetRange { Value *StartPtr; /// Alignment - The known alignment of the first store. - unsigned Alignment; + MaybeAlign Alignment; /// TheStores - The actual stores that make up this range. SmallVector<Instruction*, 16> TheStores; @@ -182,16 +177,16 @@ public: TypeSize StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType()); assert(!StoreSize.isScalable() && "Can't track scalable-typed stores"); addRange(OffsetFromFirst, StoreSize.getFixedSize(), SI->getPointerOperand(), - SI->getAlign().value(), SI); + SI->getAlign(), SI); } void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) { int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue(); - addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlignment(), MSI); + addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlign(), MSI); } - void addRange(int64_t Start, int64_t Size, Value *Ptr, - unsigned Alignment, Instruction *Inst); + void addRange(int64_t Start, int64_t Size, Value *Ptr, MaybeAlign Alignment, + Instruction *Inst); }; } // end anonymous namespace @@ -200,7 +195,7 @@ public: /// new range for the specified store at the specified offset, merging into /// existing ranges as appropriate. void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, - unsigned Alignment, Instruction *Inst) { + MaybeAlign Alignment, Instruction *Inst) { int64_t End = Start+Size; range_iterator I = partition_point( @@ -352,9 +347,25 @@ static bool accessedBetween(AliasAnalysis &AA, MemoryLocation Loc, // Check for mod of Loc between Start and End, excluding both boundaries. 
// Start and End can be in different blocks. -static bool writtenBetween(MemorySSA *MSSA, MemoryLocation Loc, - const MemoryUseOrDef *Start, +static bool writtenBetween(MemorySSA *MSSA, AliasAnalysis &AA, + MemoryLocation Loc, const MemoryUseOrDef *Start, const MemoryUseOrDef *End) { + if (isa<MemoryUse>(End)) { + // For MemoryUses, getClobberingMemoryAccess may skip non-clobbering writes. + // Manually check read accesses between Start and End, if they are in the + // same block, for clobbers. Otherwise assume Loc is clobbered. + return Start->getBlock() != End->getBlock() || + any_of( + make_range(std::next(Start->getIterator()), End->getIterator()), + [&AA, Loc](const MemoryAccess &Acc) { + if (isa<MemoryUse>(&Acc)) + return false; + Instruction *AccInst = + cast<MemoryUseOrDef>(&Acc)->getMemoryInst(); + return isModSet(AA.getModRefInfo(AccInst, Loc)); + }); + } + // TODO: Only walk until we hit Start. MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( End->getDefiningAccess(), Loc); @@ -492,7 +503,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, StartPtr = Range.StartPtr; AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start, - MaybeAlign(Range.Alignment)); + Range.Alignment); LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI : Range.TheStores) dbgs() << *SI << '\n'; @@ -749,36 +760,25 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. - CallInst *C = nullptr; - if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>( - MSSA->getWalker()->getClobberingMemoryAccess(LI))) { - // The load most post-dom the call. Limit to the same block for now. - // TODO: Support non-local call-slot optimization? 
- if (LoadClobber->getBlock() == SI->getParent()) - C = dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst()); - } - - if (C) { - // Check that nothing touches the dest of the "copy" between - // the call and the store. - MemoryLocation StoreLoc = MemoryLocation::get(SI); - if (accessedBetween(*AA, StoreLoc, MSSA->getMemoryAccess(C), - MSSA->getMemoryAccess(SI))) - C = nullptr; - } + auto GetCall = [&]() -> CallInst * { + // We defer this expensive clobber walk until the cheap checks + // have been done on the source inside performCallSlotOptzn. + if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>( + MSSA->getWalker()->getClobberingMemoryAccess(LI))) + return dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst()); + return nullptr; + }; - if (C) { - bool changed = performCallSlotOptzn( - LI, SI, SI->getPointerOperand()->stripPointerCasts(), - LI->getPointerOperand()->stripPointerCasts(), - DL.getTypeStoreSize(SI->getOperand(0)->getType()), - commonAlignment(SI->getAlign(), LI->getAlign()), C); - if (changed) { - eraseInstruction(SI); - eraseInstruction(LI); - ++NumMemCpyInstr; - return true; - } + bool changed = performCallSlotOptzn( + LI, SI, SI->getPointerOperand()->stripPointerCasts(), + LI->getPointerOperand()->stripPointerCasts(), + DL.getTypeStoreSize(SI->getOperand(0)->getType()), + std::min(SI->getAlign(), LI->getAlign()), GetCall); + if (changed) { + eraseInstruction(SI); + eraseInstruction(LI); + ++NumMemCpyInstr; + return true; } } } @@ -853,7 +853,8 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore, Value *cpyDest, Value *cpySrc, TypeSize cpySize, - Align cpyAlign, CallInst *C) { + Align cpyAlign, + std::function<CallInst *()> GetC) { // The general transformation to keep in mind is // // call @func(..., src, ...) 
@@ -872,11 +873,6 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, if (cpySize.isScalable()) return false; - // Lifetime marks shouldn't be operated on. - if (Function *F = C->getCalledFunction()) - if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start) - return false; - // Require that src be an alloca. This simplifies the reasoning considerably. auto *srcAlloca = dyn_cast<AllocaInst>(cpySrc); if (!srcAlloca) @@ -893,6 +889,33 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, if (cpySize < srcSize) return false; + CallInst *C = GetC(); + if (!C) + return false; + + // Lifetime marks shouldn't be operated on. + if (Function *F = C->getCalledFunction()) + if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start) + return false; + + + if (C->getParent() != cpyStore->getParent()) { + LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n"); + return false; + } + + MemoryLocation DestLoc = isa<StoreInst>(cpyStore) ? + MemoryLocation::get(cpyStore) : + MemoryLocation::getForDest(cast<MemCpyInst>(cpyStore)); + + // Check that nothing touches the dest of the copy between + // the call and the store/memcpy. + if (accessedBetween(*AA, DestLoc, MSSA->getMemoryAccess(C), + MSSA->getMemoryAccess(cpyStore))) { + LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer modified after call\n"); + return false; + } + // Check that accessing the first srcSize bytes of dest will not cause a // trap. Otherwise the transform is invalid since it might cause a trap // to occur earlier than it otherwise would. @@ -902,6 +925,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, return false; } + // Make sure that nothing can observe cpyDest being written early. There are // a number of cases to consider: // 1. 
cpyDest cannot be accessed between C and cpyStore as a precondition of @@ -1118,7 +1142,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // then we could still perform the xform by moving M up to the first memcpy. // TODO: It would be sufficient to check the MDep source up to the memcpy // size of M, rather than MDep. - if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep), + if (writtenBetween(MSSA, *AA, MemoryLocation::getForSource(MDep), MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M))) return false; @@ -1215,14 +1239,14 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, } // By default, create an unaligned memset. - unsigned Align = 1; + Align Alignment = Align(1); // If Dest is aligned, and SrcSize is constant, use the minimum alignment // of the sum. - const unsigned DestAlign = - std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment()); + const Align DestAlign = std::max(MemSet->getDestAlign().valueOrOne(), + MemCpy->getDestAlign().valueOrOne()); if (DestAlign > 1) if (auto *SrcSizeC = dyn_cast<ConstantInt>(SrcSize)) - Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign); + Alignment = commonAlignment(DestAlign, SrcSizeC->getZExtValue()); IRBuilder<> Builder(MemCpy); @@ -1241,11 +1265,11 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff); unsigned DestAS = Dest->getType()->getPointerAddressSpace(); Instruction *NewMemSet = Builder.CreateMemSet( - Builder.CreateGEP(Builder.getInt8Ty(), - Builder.CreatePointerCast(Dest, - Builder.getInt8PtrTy(DestAS)), - SrcSize), - MemSet->getOperand(1), MemsetLen, MaybeAlign(Align)); + Builder.CreateGEP( + Builder.getInt8Ty(), + Builder.CreatePointerCast(Dest, Builder.getInt8PtrTy(DestAS)), + SrcSize), + MemSet->getOperand(1), MemsetLen, Alignment); assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) && "MemCpy must be a MemoryDef"); @@ -1402,7 +1426,8 @@ bool 
MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { } MemoryUseOrDef *MA = MSSA->getMemoryAccess(M); - MemoryAccess *AnyClobber = MSSA->getWalker()->getClobberingMemoryAccess(MA); + // FIXME: Not using getClobberingMemoryAccess() here due to PR54682. + MemoryAccess *AnyClobber = MA->getDefiningAccess(); MemoryLocation DestLoc = MemoryLocation::getForDest(M); const MemoryAccess *DestClobber = MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc); @@ -1431,28 +1456,20 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { if (Instruction *MI = MD->getMemoryInst()) { if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) { if (auto *C = dyn_cast<CallInst>(MI)) { - // The memcpy must post-dom the call. Limit to the same block for - // now. Additionally, we need to ensure that there are no accesses - // to dest between the call and the memcpy. Accesses to src will be - // checked by performCallSlotOptzn(). - // TODO: Support non-local call-slot optimization? - if (C->getParent() == M->getParent() && - !accessedBetween(*AA, DestLoc, MD, MA)) { - // FIXME: Can we pass in either of dest/src alignment here instead - // of conservatively taking the minimum? - Align Alignment = std::min(M->getDestAlign().valueOrOne(), - M->getSourceAlign().valueOrOne()); - if (performCallSlotOptzn( - M, M, M->getDest(), M->getSource(), - TypeSize::getFixed(CopySize->getZExtValue()), Alignment, - C)) { - LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n" - << " call: " << *C << "\n" - << " memcpy: " << *M << "\n"); - eraseInstruction(M); - ++NumMemCpyInstr; - return true; - } + // FIXME: Can we pass in either of dest/src alignment here instead + // of conservatively taking the minimum? 
+ Align Alignment = std::min(M->getDestAlign().valueOrOne(), + M->getSourceAlign().valueOrOne()); + if (performCallSlotOptzn( + M, M, M->getDest(), M->getSource(), + TypeSize::getFixed(CopySize->getZExtValue()), Alignment, + [C]() -> CallInst * { return C; })) { + LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n" + << " call: " << *C << "\n" + << " memcpy: " << *M << "\n"); + eraseInstruction(M); + ++NumMemCpyInstr; + return true; } } } @@ -1557,7 +1574,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { // *b = 42; // foo(*a) // It would be invalid to transform the second memcpy into foo(*b). - if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep), + if (writtenBetween(MSSA, *AA, MemoryLocation::getForSource(MDep), MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB))) return false; diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp index aac0deea5be3..ce01ae5b2692 100644 --- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -144,31 +144,33 @@ BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) { LLVM_DEBUG(dbgs() << "volatile or atomic\n"); return {}; } - Value *const Addr = LoadI->getOperand(0); + Value *Addr = LoadI->getOperand(0); if (Addr->getType()->getPointerAddressSpace() != 0) { LLVM_DEBUG(dbgs() << "from non-zero AddressSpace\n"); return {}; } - auto *const GEP = dyn_cast<GetElementPtrInst>(Addr); - if (!GEP) - return {}; - LLVM_DEBUG(dbgs() << "GEP\n"); - if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) { - LLVM_DEBUG(dbgs() << "used outside of block\n"); - return {}; - } - const auto &DL = GEP->getModule()->getDataLayout(); - if (!isDereferenceablePointer(GEP, LoadI->getType(), DL)) { + const auto &DL = LoadI->getModule()->getDataLayout(); + if (!isDereferenceablePointer(Addr, LoadI->getType(), DL)) { LLVM_DEBUG(dbgs() << "not dereferenceable\n"); // We need to make sure that we can do comparison 
in any order, so we // require memory to be unconditionnally dereferencable. return {}; } - APInt Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0); - if (!GEP->accumulateConstantOffset(DL, Offset)) - return {}; - return BCEAtom(GEP, LoadI, BaseId.getBaseId(GEP->getPointerOperand()), - Offset); + + APInt Offset = APInt(DL.getPointerTypeSizeInBits(Addr->getType()), 0); + Value *Base = Addr; + auto *GEP = dyn_cast<GetElementPtrInst>(Addr); + if (GEP) { + LLVM_DEBUG(dbgs() << "GEP\n"); + if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) { + LLVM_DEBUG(dbgs() << "used outside of block\n"); + return {}; + } + if (!GEP->accumulateConstantOffset(DL, Offset)) + return {}; + Base = GEP->getPointerOperand(); + } + return BCEAtom(GEP, LoadI, BaseId.getBaseId(Base), Offset); } // A comparison between two BCE atoms, e.g. `a == o.a` in the example at the @@ -244,7 +246,7 @@ bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, auto MayClobber = [&](LoadInst *LI) { // If a potentially clobbering instruction comes before the load, // we can still safely sink the load. - return !Inst->comesBefore(LI) && + return (Inst->getParent() != LI->getParent() || !Inst->comesBefore(LI)) && isModSet(AA.getModRefInfo(Inst, MemoryLocation::get(LI))); }; if (MayClobber(Cmp.Lhs.LoadI) || MayClobber(Cmp.Rhs.LoadI)) @@ -270,9 +272,8 @@ void BCECmpBlock::split(BasicBlock *NewParent, AliasAnalysis &AA) const { } // Do the actual spliting. 
- for (Instruction *Inst : reverse(OtherInsts)) { - Inst->moveBefore(&*NewParent->begin()); - } + for (Instruction *Inst : reverse(OtherInsts)) + Inst->moveBefore(*NewParent, NewParent->begin()); } bool BCECmpBlock::canSplit(AliasAnalysis &AA) const { @@ -368,8 +369,11 @@ Optional<BCECmpBlock> visitCmpBlock(Value *const Val, BasicBlock *const Block, return None; BCECmpBlock::InstructionSet BlockInsts( - {Result->Lhs.GEP, Result->Rhs.GEP, Result->Lhs.LoadI, Result->Rhs.LoadI, - Result->CmpI, BranchI}); + {Result->Lhs.LoadI, Result->Rhs.LoadI, Result->CmpI, BranchI}); + if (Result->Lhs.GEP) + BlockInsts.insert(Result->Lhs.GEP); + if (Result->Rhs.GEP) + BlockInsts.insert(Result->Rhs.GEP); return BCECmpBlock(std::move(*Result), Block, BlockInsts); } @@ -604,8 +608,15 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, NextCmpBlock->getParent(), InsertBefore); IRBuilder<> Builder(BB); // Add the GEPs from the first BCECmpBlock. - Value *const Lhs = Builder.Insert(FirstCmp.Lhs().GEP->clone()); - Value *const Rhs = Builder.Insert(FirstCmp.Rhs().GEP->clone()); + Value *Lhs, *Rhs; + if (FirstCmp.Lhs().GEP) + Lhs = Builder.Insert(FirstCmp.Lhs().GEP->clone()); + else + Lhs = FirstCmp.Lhs().LoadI->getPointerOperand(); + if (FirstCmp.Rhs().GEP) + Rhs = Builder.Insert(FirstCmp.Rhs().GEP->clone()); + else + Rhs = FirstCmp.Rhs().LoadI->getPointerOperand(); Value *IsEqual = nullptr; LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons -> " diff --git a/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 734532a6670c..6383d6ea838b 100644 --- a/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -76,13 +76,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" 
-#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/Loads.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Metadata.h" +#include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index f35c9212a6f9..876ef3c427a6 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -88,8 +88,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" @@ -1076,6 +1074,9 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, Value *Arg1, Value *Arg2, Instruction *I) const { auto *E = new (ExpressionAllocator) BasicExpression(2); + // TODO: we need to remove context instruction after Value Tracking + // can run without context instruction + const SimplifyQuery Q = SQ.getWithInstruction(I); E->setType(T); E->setOpcode(Opcode); @@ -1091,7 +1092,7 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, E->op_push_back(lookupOperandLeader(Arg1)); E->op_push_back(lookupOperandLeader(Arg2)); - Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), SQ); + Value *V = simplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), Q); if (auto Simplified = checkExprResults(E, I, V)) { addAdditionalUsers(Simplified, I); return Simplified.Expr; @@ -1147,6 +1148,9 @@ NewGVN::ExprResult NewGVN::checkExprResults(Expression *E, Instruction *I, NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const { auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands()); + // TODO: we need to remove context instruction after Value Tracking + // can run without context 
instruction + const SimplifyQuery Q = SQ.getWithInstruction(I); bool AllConstant = setBasicExpressionInfo(I, E); @@ -1169,13 +1173,13 @@ NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const { Predicate = CmpInst::getSwappedPredicate(Predicate); } E->setOpcode((CI->getOpcode() << 8) | Predicate); - // TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands + // TODO: 25% of our time is spent in simplifyCmpInst with pointer operands assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() && "Wrong types on cmp instruction"); assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() && E->getOperand(1)->getType() == I->getOperand(1)->getType())); Value *V = - SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), SQ); + simplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), Q); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (isa<SelectInst>(I)) { @@ -1183,26 +1187,26 @@ NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const { E->getOperand(1) == E->getOperand(2)) { assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() && E->getOperand(2)->getType() == I->getOperand(2)->getType()); - Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1), - E->getOperand(2), SQ); + Value *V = simplifySelectInst(E->getOperand(0), E->getOperand(1), + E->getOperand(2), Q); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } } else if (I->isBinaryOp()) { Value *V = - SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), SQ); + simplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), Q); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (auto *CI = dyn_cast<CastInst>(I)) { Value *V = - SimplifyCastInst(CI->getOpcode(), E->getOperand(0), CI->getType(), SQ); + simplifyCastInst(CI->getOpcode(), E->getOperand(0), CI->getType(), Q); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } 
else if (auto *GEPI = dyn_cast<GetElementPtrInst>(I)) { Value *V = - SimplifyGEPInst(GEPI->getSourceElementType(), *E->op_begin(), + simplifyGEPInst(GEPI->getSourceElementType(), *E->op_begin(), makeArrayRef(std::next(E->op_begin()), E->op_end()), - GEPI->isInBounds(), SQ); + GEPI->isInBounds(), Q); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (AllConstant) { @@ -1453,10 +1457,12 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, if (Offset >= 0) { if (auto *C = dyn_cast<Constant>( lookupOperandLeader(DepSI->getValueOperand()))) { - LLVM_DEBUG(dbgs() << "Coercing load from store " << *DepSI - << " to constant " << *C << "\n"); - return createConstantExpression( - getConstantStoreValueForLoad(C, Offset, LoadType, DL)); + if (Constant *Res = + getConstantStoreValueForLoad(C, Offset, LoadType, DL)) { + LLVM_DEBUG(dbgs() << "Coercing load from store " << *DepSI + << " to constant " << *Res << "\n"); + return createConstantExpression(Res); + } } } } else if (auto *DepLI = dyn_cast<LoadInst>(DepInst)) { @@ -1503,9 +1509,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start) return createConstantExpression(UndefValue::get(LoadType)); - } else if (isAllocationFn(DepInst, TLI)) - if (auto *InitVal = getInitialValueOfAllocation(cast<CallBase>(DepInst), - TLI, LoadType)) + } else if (auto *InitVal = + getInitialValueOfAllocation(DepInst, TLI, LoadType)) return createConstantExpression(InitVal); return nullptr; @@ -3142,9 +3147,8 @@ bool NewGVN::singleReachablePHIPath( // connected component finding in this routine, and it's probably not worth // the complexity for the time being. So, we just keep a set of visited // MemoryAccess and return true when we hit a cycle. 
- if (Visited.count(First)) + if (!Visited.insert(First).second) return true; - Visited.insert(First); const auto *EndDef = First; for (auto *ChainDef : optimized_def_chain(First)) { @@ -3353,7 +3357,7 @@ void NewGVN::verifyStoreExpressions() const { // instruction set, propagating value numbers, marking things touched, etc, // until the set of touched instructions is completely empty. void NewGVN::iterateTouchedInstructions() { - unsigned int Iterations = 0; + uint64_t Iterations = 0; // Figure out where touchedinstructions starts int FirstInstr = TouchedInstructions.find_first(); // Nothing set, nothing to iterate, just return. diff --git a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index e0d0301c1ef6..689a2a286cb9 100644 --- a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -125,6 +125,9 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI, if (Call->isNoBuiltin() || Call->isStrictFP()) continue; + if (Call->isMustTailCall()) + continue; + // Skip if function either has local linkage or is not a known library // function. LibFunc LF; @@ -137,7 +140,7 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI, case LibFunc_sqrt: if (TTI->haveFastSqrt(Call->getType()) && optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI, - DTU.hasValue() ? DTU.getPointer() : nullptr)) + DTU ? 
DTU.getPointer() : nullptr)) break; continue; default: diff --git a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index a110f7d5c241..e1cc3fc71c3e 100644 --- a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -53,9 +53,9 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LegacyPassManager.h" @@ -65,6 +65,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" #define DEBUG_TYPE "safepoint-placement" diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index c354fa177a60..da1737979305 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -24,7 +24,6 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -42,7 +41,6 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" @@ -54,7 +52,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" @@ -183,7 +180,7 @@ void ReassociatePass::BuildRankMap(Function &F, // 
we cannot move. This ensures that the ranks for these instructions are // all different in the block. for (Instruction &I : *BB) - if (mayBeMemoryDependent(I)) + if (mayHaveNonDefUseDependency(I)) ValueRankMap[&I] = ++BBRank; } } @@ -1076,7 +1073,7 @@ static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { BinaryOperator *Mul = BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); - Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op. + Shl->setOperand(0, PoisonValue::get(Shl->getType())); // Drop use of op. Mul->takeName(Shl); // Everyone now refers to the mul instruction. diff --git a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index a49b9ad3f62b..9dc64493a9ee 100644 --- a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -24,8 +24,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index b795ad3899bc..51e4a5773f3e 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -258,6 +258,7 @@ struct GCPtrLivenessData { // base relation will remain. 
Internally, we add a mixture of the two // types, then update all the second type to the first type using DefiningValueMapTy = MapVector<Value *, Value *>; +using IsKnownBaseMapTy = MapVector<Value *, bool>; using PointerToBaseTy = MapVector<Value *, Value *>; using StatepointLiveSetTy = SetVector<Value *>; using RematerializedValueMapTy = @@ -281,19 +282,29 @@ struct PartiallyConstructedSafepointRecord { RematerializedValueMapTy RematerializedValues; }; +struct RematerizlizationCandidateRecord { + // Chain from derived pointer to base. + SmallVector<Instruction *, 3> ChainToBase; + // Original base. + Value *RootOfChain; + // Cost of chain. + InstructionCost Cost; +}; +using RematCandTy = MapVector<Value *, RematerizlizationCandidateRecord>; + } // end anonymous namespace static ArrayRef<Use> GetDeoptBundleOperands(const CallBase *Call) { Optional<OperandBundleUse> DeoptBundle = Call->getOperandBundle(LLVMContext::OB_deopt); - if (!DeoptBundle.hasValue()) { + if (!DeoptBundle) { assert(AllowStatepointWithNoDeoptInfo && "Found non-leaf call without deopt info!"); return None; } - return DeoptBundle.getValue().Inputs; + return DeoptBundle->Inputs; } /// Compute the live-in set for every basic block in the function @@ -385,45 +396,16 @@ static void analyzeParsePointLiveness( Result.LiveSet = LiveSet; } -// Returns true is V is a knownBaseResult. -static bool isKnownBaseResult(Value *V); - -// Returns true if V is a BaseResult that already exists in the IR, i.e. it is -// not created by the findBasePointers algorithm. -static bool isOriginalBaseResult(Value *V); +/// Returns true if V is a known base. +static bool isKnownBase(Value *V, const IsKnownBaseMapTy &KnownBases); -namespace { - -/// A single base defining value - An immediate base defining value for an -/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'. 
-/// For instructions which have multiple pointer [vector] inputs or that -/// transition between vector and scalar types, there is no immediate base -/// defining value. The 'base defining value' for 'Def' is the transitive -/// closure of this relation stopping at the first instruction which has no -/// immediate base defining value. The b.d.v. might itself be a base pointer, -/// but it can also be an arbitrary derived pointer. -struct BaseDefiningValueResult { - /// Contains the value which is the base defining value. - Value * const BDV; +/// Caches the IsKnownBase flag for a value and asserts that it wasn't present +/// in the cache before. +static void setKnownBase(Value *V, bool IsKnownBase, + IsKnownBaseMapTy &KnownBases); - /// True if the base defining value is also known to be an actual base - /// pointer. - const bool IsKnownBase; - - BaseDefiningValueResult(Value *BDV, bool IsKnownBase) - : BDV(BDV), IsKnownBase(IsKnownBase) { -#ifndef NDEBUG - // Check consistency between new and old means of checking whether a BDV is - // a base. - bool MustBeBase = isKnownBaseResult(BDV); - assert(!MustBeBase || MustBeBase == IsKnownBase); -#endif - } -}; - -} // end anonymous namespace - -static BaseDefiningValueResult findBaseDefiningValue(Value *I); +static Value *findBaseDefiningValue(Value *I, DefiningValueMapTy &Cache, + IsKnownBaseMapTy &KnownBases); /// Return a base defining value for the 'Index' element of the given vector /// instruction 'I'. If Index is null, returns a BDV for the entire vector @@ -434,76 +416,122 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I); /// vector returned is a BDV (and possibly a base) of the entire vector 'I'. /// If the later, the return pointer is a BDV (or possibly a base) for the /// particular element in 'I'. 
-static BaseDefiningValueResult -findBaseDefiningValueOfVector(Value *I) { +static Value *findBaseDefiningValueOfVector(Value *I, DefiningValueMapTy &Cache, + IsKnownBaseMapTy &KnownBases) { // Each case parallels findBaseDefiningValue below, see that code for // detailed motivation. - if (isa<Argument>(I)) + auto Cached = Cache.find(I); + if (Cached != Cache.end()) + return Cached->second; + + if (isa<Argument>(I)) { // An incoming argument to the function is a base pointer - return BaseDefiningValueResult(I, true); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } - if (isa<Constant>(I)) + if (isa<Constant>(I)) { // Base of constant vector consists only of constant null pointers. // For reasoning see similar case inside 'findBaseDefiningValue' function. - return BaseDefiningValueResult(ConstantAggregateZero::get(I->getType()), - true); + auto *CAZ = ConstantAggregateZero::get(I->getType()); + Cache[I] = CAZ; + setKnownBase(CAZ, /* IsKnownBase */true, KnownBases); + return CAZ; + } - if (isa<LoadInst>(I)) - return BaseDefiningValueResult(I, true); + if (isa<LoadInst>(I)) { + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } - if (isa<InsertElementInst>(I)) + if (isa<InsertElementInst>(I)) { // We don't know whether this vector contains entirely base pointers or // not. To be conservatively correct, we treat it as a BDV and will // duplicate code as needed to construct a parallel vector of bases. - return BaseDefiningValueResult(I, false); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */false, KnownBases); + return I; + } - if (isa<ShuffleVectorInst>(I)) + if (isa<ShuffleVectorInst>(I)) { // We don't know whether this vector contains entirely base pointers or // not. To be conservatively correct, we treat it as a BDV and will // duplicate code as needed to construct a parallel vector of bases. 
// TODO: There a number of local optimizations which could be applied here // for particular sufflevector patterns. - return BaseDefiningValueResult(I, false); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */false, KnownBases); + return I; + } // The behavior of getelementptr instructions is the same for vector and // non-vector data types. - if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) - return findBaseDefiningValue(GEP->getPointerOperand()); + if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { + auto *BDV = + findBaseDefiningValue(GEP->getPointerOperand(), Cache, KnownBases); + Cache[GEP] = BDV; + return BDV; + } + + // The behavior of freeze instructions is the same for vector and + // non-vector data types. + if (auto *Freeze = dyn_cast<FreezeInst>(I)) { + auto *BDV = findBaseDefiningValue(Freeze->getOperand(0), Cache, KnownBases); + Cache[Freeze] = BDV; + return BDV; + } // If the pointer comes through a bitcast of a vector of pointers to // a vector of another type of pointer, then look through the bitcast - if (auto *BC = dyn_cast<BitCastInst>(I)) - return findBaseDefiningValue(BC->getOperand(0)); + if (auto *BC = dyn_cast<BitCastInst>(I)) { + auto *BDV = findBaseDefiningValue(BC->getOperand(0), Cache, KnownBases); + Cache[BC] = BDV; + return BDV; + } // We assume that functions in the source language only return base // pointers. This should probably be generalized via attributes to support // both source language and internal functions. - if (isa<CallInst>(I) || isa<InvokeInst>(I)) - return BaseDefiningValueResult(I, true); + if (isa<CallInst>(I) || isa<InvokeInst>(I)) { + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } // A PHI or Select is a base defining value. The outer findBasePointer // algorithm is responsible for constructing a base value for this BDV. 
assert((isa<SelectInst>(I) || isa<PHINode>(I)) && "unknown vector instruction - no base found for vector element"); - return BaseDefiningValueResult(I, false); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */false, KnownBases); + return I; } /// Helper function for findBasePointer - Will return a value which either a) /// defines the base pointer for the input, b) blocks the simple search /// (i.e. a PHI or Select of two derived pointers), or c) involves a change /// from pointer to vector type or back. -static BaseDefiningValueResult findBaseDefiningValue(Value *I) { +static Value *findBaseDefiningValue(Value *I, DefiningValueMapTy &Cache, + IsKnownBaseMapTy &KnownBases) { assert(I->getType()->isPtrOrPtrVectorTy() && "Illegal to ask for the base pointer of a non-pointer type"); + auto Cached = Cache.find(I); + if (Cached != Cache.end()) + return Cached->second; if (I->getType()->isVectorTy()) - return findBaseDefiningValueOfVector(I); + return findBaseDefiningValueOfVector(I, Cache, KnownBases); - if (isa<Argument>(I)) + if (isa<Argument>(I)) { // An incoming argument to the function is a base pointer // We should have never reached here if this argument isn't an gc value - return BaseDefiningValueResult(I, true); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } if (isa<Constant>(I)) { // We assume that objects with a constant base (e.g. a global) can't move @@ -516,8 +544,10 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // "phi (const1, const2)" or "phi (const, regular gc ptr)". // See constant.ll file for relevant test cases. - return BaseDefiningValueResult( - ConstantPointerNull::get(cast<PointerType>(I->getType())), true); + auto *CPN = ConstantPointerNull::get(cast<PointerType>(I->getType())); + Cache[I] = CPN; + setKnownBase(CPN, /* IsKnownBase */true, KnownBases); + return CPN; } // inttoptrs in an integral address space are currently ill-defined. 
We @@ -525,8 +555,11 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // constant rule above and because we don't really have a better semantic // to give them. Note that the optimizer is always free to insert undefined // behavior on dynamically dead paths as well. - if (isa<IntToPtrInst>(I)) - return BaseDefiningValueResult(I, true); + if (isa<IntToPtrInst>(I)) { + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } if (CastInst *CI = dyn_cast<CastInst>(I)) { Value *Def = CI->stripPointerCasts(); @@ -539,16 +572,31 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // not simply a pointer cast (i.e. an inttoptr). We don't know how to // handle int->ptr conversion. assert(!isa<CastInst>(Def) && "shouldn't find another cast here"); - return findBaseDefiningValue(Def); + auto *BDV = findBaseDefiningValue(Def, Cache, KnownBases); + Cache[CI] = BDV; + return BDV; } - if (isa<LoadInst>(I)) + if (isa<LoadInst>(I)) { // The value loaded is an gc base itself - return BaseDefiningValueResult(I, true); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { // The base of this GEP is the base - return findBaseDefiningValue(GEP->getPointerOperand()); + auto *BDV = + findBaseDefiningValue(GEP->getPointerOperand(), Cache, KnownBases); + Cache[GEP] = BDV; + return BDV; + } + + if (auto *Freeze = dyn_cast<FreezeInst>(I)) { + auto *BDV = findBaseDefiningValue(Freeze->getOperand(0), Cache, KnownBases); + Cache[Freeze] = BDV; + return BDV; + } if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { @@ -569,24 +617,32 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { llvm_unreachable( "interaction with the gcroot mechanism is not supported"); case Intrinsic::experimental_gc_get_pointer_base: - return 
findBaseDefiningValue(II->getOperand(0)); + auto *BDV = findBaseDefiningValue(II->getOperand(0), Cache, KnownBases); + Cache[II] = BDV; + return BDV; } } // We assume that functions in the source language only return base // pointers. This should probably be generalized via attributes to support // both source language and internal functions. - if (isa<CallInst>(I) || isa<InvokeInst>(I)) - return BaseDefiningValueResult(I, true); + if (isa<CallInst>(I) || isa<InvokeInst>(I)) { + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } // TODO: I have absolutely no idea how to implement this part yet. It's not // necessarily hard, I just haven't really looked at it yet. assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented"); - if (isa<AtomicCmpXchgInst>(I)) + if (isa<AtomicCmpXchgInst>(I)) { // A CAS is effectively a atomic store and load combined under a // predicate. From the perspective of base pointers, we just treat it // like a load. - return BaseDefiningValueResult(I, true); + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are " "binary ops which don't apply to pointers"); @@ -594,8 +650,11 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // The aggregate ops. Aggregates can either be in the heap or on the // stack, but in either case, this is simply a field load. As a result, // this is a defining definition of the base just like a load is. - if (isa<ExtractValueInst>(I)) - return BaseDefiningValueResult(I, true); + if (isa<ExtractValueInst>(I)) { + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */true, KnownBases); + return I; + } // We should never see an insert vector since that would require we be // tracing back a struct value not a pointer value. @@ -606,6 +665,8 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // substituting gc.get.pointer.base() intrinsic. 
bool IsKnownBase = isa<Instruction>(I) && cast<Instruction>(I)->getMetadata("is_base_value"); + setKnownBase(I, /* IsKnownBase */IsKnownBase, KnownBases); + Cache[I] = I; // An extractelement produces a base result exactly when it's input does. // We may need to insert a parallel instruction to extract the appropriate @@ -615,33 +676,38 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // Note: There a lot of obvious peephole cases here. This are deliberately // handled after the main base pointer inference algorithm to make writing // test cases to exercise that code easier. - return BaseDefiningValueResult(I, IsKnownBase); + return I; // The last two cases here don't return a base pointer. Instead, they // return a value which dynamically selects from among several base // derived pointers (each with it's own base potentially). It's the job of // the caller to resolve these. assert((isa<SelectInst>(I) || isa<PHINode>(I)) && - "missing instruction case in findBaseDefiningValing"); - return BaseDefiningValueResult(I, IsKnownBase); + "missing instruction case in findBaseDefiningValue"); + return I; } /// Returns the base defining value for this value. -static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) { - Value *&Cached = Cache[I]; - if (!Cached) { - Cached = findBaseDefiningValue(I).BDV; +static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache, + IsKnownBaseMapTy &KnownBases) { + if (Cache.find(I) == Cache.end()) { + auto *BDV = findBaseDefiningValue(I, Cache, KnownBases); + Cache[I] = BDV; LLVM_DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> " - << Cached->getName() << "\n"); + << Cache[I]->getName() << ", is known base = " + << KnownBases[I] << "\n"); } assert(Cache[I] != nullptr); - return Cached; + assert(KnownBases.find(Cache[I]) != KnownBases.end() && + "Cached value must be present in known bases map"); + return Cache[I]; } /// Return a base pointer for this value if known. 
Otherwise, return it's /// base defining value. -static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) { - Value *Def = findBaseDefiningValueCached(I, Cache); +static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache, + IsKnownBaseMapTy &KnownBases) { + Value *Def = findBaseDefiningValueCached(I, Cache, KnownBases); auto Found = Cache.find(Def); if (Found != Cache.end()) { // Either a base-of relation, or a self reference. Caller must check. @@ -651,6 +717,7 @@ static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) { return Def; } +#ifndef NDEBUG /// This value is a base pointer that is not generated by RS4GC, i.e. it already /// exists in the code. static bool isOriginalBaseResult(Value *V) { @@ -659,21 +726,22 @@ static bool isOriginalBaseResult(Value *V) { !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) && !isa<ShuffleVectorInst>(V); } +#endif -/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV, -/// is it known to be a base pointer? Or do we need to continue searching. -static bool isKnownBaseResult(Value *V) { - if (isOriginalBaseResult(V)) - return true; - if (isa<Instruction>(V) && - cast<Instruction>(V)->getMetadata("is_base_value")) { - // This is a previously inserted base phi or select. We know - // that this is a base value. - return true; - } +static bool isKnownBase(Value *V, const IsKnownBaseMapTy &KnownBases) { + auto It = KnownBases.find(V); + assert(It != KnownBases.end() && "Value not present in the map"); + return It->second; +} - // We need to keep searching - return false; +static void setKnownBase(Value *V, bool IsKnownBase, + IsKnownBaseMapTy &KnownBases) { +#ifndef NDEBUG + auto It = KnownBases.find(V); + if (It != KnownBases.end()) + assert(It->second == IsKnownBase && "Changing already present value"); +#endif + KnownBases[V] = IsKnownBase; } // Returns true if First and Second values are both scalar or both vector. 
@@ -801,10 +869,11 @@ static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) { /// For gc objects, this is simply itself. On success, returns a value which is /// the base pointer. (This is reliable and can be used for relocation.) On /// failure, returns nullptr. -static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { - Value *Def = findBaseOrBDV(I, Cache); +static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache, + IsKnownBaseMapTy &KnownBases) { + Value *Def = findBaseOrBDV(I, Cache, KnownBases); - if (isKnownBaseResult(Def) && areBothVectorOrScalar(Def, I)) + if (isKnownBase(Def, KnownBases) && areBothVectorOrScalar(Def, I)) return Def; // Here's the rough algorithm: @@ -887,8 +956,8 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { assert(!isOriginalBaseResult(Current) && "why did it get added?"); auto visitIncomingValue = [&](Value *InVal) { - Value *Base = findBaseOrBDV(InVal, Cache); - if (isKnownBaseResult(Base) && areBothVectorOrScalar(Base, InVal)) + Value *Base = findBaseOrBDV(InVal, Cache, KnownBases); + if (isKnownBase(Base, KnownBases) && areBothVectorOrScalar(Base, InVal)) // Known bases won't need new instructions introduced and can be // ignored safely. However, this can only be done when InVal and Base // are both scalar or both vector. Otherwise, we need to find a @@ -924,12 +993,16 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { for (auto Pair : States) { Value *BDV = Pair.first; auto canPruneInput = [&](Value *V) { - Value *BDV = findBaseOrBDV(V, Cache); - if (V->stripPointerCasts() != BDV) + // If the input of the BDV is the BDV itself we can prune it. This is + // only possible if the BDV is a PHI node. + if (V->stripPointerCasts() == BDV) + return true; + Value *VBDV = findBaseOrBDV(V, Cache, KnownBases); + if (V->stripPointerCasts() != VBDV) return false; // The assumption is that anything not in the state list is // propagates a base pointer. 
- return States.count(BDV) == 0; + return States.count(VBDV) == 0; }; bool CanPrune = true; @@ -975,13 +1048,13 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // Only values that do not have known bases or those that have differing // type (scalar versus vector) from a possible known base should be in the // lattice. - assert((!isKnownBaseResult(BDV) || + assert((!isKnownBase(BDV, KnownBases) || !areBothVectorOrScalar(BDV, Pair.second.getBaseValue())) && "why did it get added?"); BDVState NewState(BDV); visitBDVOperands(BDV, [&](Value *Op) { - Value *BDV = findBaseOrBDV(Op, Cache); + Value *BDV = findBaseOrBDV(Op, Cache, KnownBases); auto OpState = GetStateForBDV(BDV, Op); NewState.meet(OpState); }); @@ -1014,8 +1087,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // Only values that do not have known bases or those that have differing // type (scalar versus vector) from a possible known base should be in the // lattice. - assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, BaseValue)) && - "why did it get added?"); + assert( + (!isKnownBase(I, KnownBases) || !areBothVectorOrScalar(I, BaseValue)) && + "why did it get added?"); assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); if (!State.isBase() || !isa<VectorType>(BaseValue->getType())) @@ -1033,6 +1107,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { State.getBaseValue(), EE->getIndexOperand(), "base_ee", EE); BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); States[I] = BDVState(I, BDVState::Base, BaseInst); + setKnownBase(BaseInst, /* IsKnownBase */true, KnownBases); } else if (!isa<VectorType>(I->getType())) { // We need to handle cases that have a vector base but the instruction is // a scalar type (these could be phis or selects or any instruction that @@ -1055,7 +1130,8 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // Only values that do not have known 
bases or those that have differing // type (scalar versus vector) from a possible known base should be in the // lattice. - assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, State.getBaseValue())) && + assert((!isKnownBase(I, KnownBases) || + !areBothVectorOrScalar(I, State.getBaseValue())) && "why did it get added?"); assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); @@ -1087,6 +1163,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // Add metadata marking this as a base value BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); States[I] = BDVState(I, BDVState::Conflict, BaseInst); + setKnownBase(BaseInst, /* IsKnownBase */true, KnownBases); } #ifndef NDEBUG @@ -1102,7 +1179,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // assured to be able to determine an instruction which produces it's base // pointer. auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) { - Value *BDV = findBaseOrBDV(Input, Cache); + Value *BDV = findBaseOrBDV(Input, Cache, KnownBases); Value *Base = nullptr; if (!States.count(BDV)) { assert(areBothVectorOrScalar(BDV, Input)); @@ -1129,7 +1206,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // Only values that do not have known bases or those that have differing // type (scalar versus vector) from a possible known base should be in the // lattice. - assert((!isKnownBaseResult(BDV) || + assert((!isKnownBase(BDV, KnownBases) || !areBothVectorOrScalar(BDV, State.getBaseValue())) && "why did it get added?"); assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); @@ -1154,13 +1231,21 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { #ifndef NDEBUG Value *OldBase = BlockToValue[InBB]; Value *Base = getBaseForInput(InVal, nullptr); + + // We can't use `stripPointerCasts` instead of this function because + // `stripPointerCasts` doesn't handle vectors of pointers. 
+ auto StripBitCasts = [](Value *V) -> Value * { + while (auto *BC = dyn_cast<BitCastInst>(V)) + V = BC->getOperand(0); + return V; + }; // In essence this assert states: the only way two values // incoming from the same basic block may be different is by // being different bitcasts of the same value. A cleanup // that remains TODO is changing findBaseOrBDV to return an // llvm::Value of the correct type (and still remain pure). // This will remove the need to add bitcasts. - assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() && + assert(StripBitCasts(Base) == StripBitCasts(OldBase) && "findBaseOrBDV should be pure!"); #endif } @@ -1223,8 +1308,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // Only values that do not have known bases or those that have differing // type (scalar versus vector) from a possible known base should be in the // lattice. - assert((!isKnownBaseResult(BDV) || !areBothVectorOrScalar(BDV, Base)) && - "why did it get added?"); + assert( + (!isKnownBase(BDV, KnownBases) || !areBothVectorOrScalar(BDV, Base)) && + "why did it get added?"); LLVM_DEBUG( dbgs() << "Updating base value cache" @@ -1255,9 +1341,10 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { // pointer was a base pointer. 
static void findBasePointers(const StatepointLiveSetTy &live, PointerToBaseTy &PointerToBase, DominatorTree *DT, - DefiningValueMapTy &DVCache) { + DefiningValueMapTy &DVCache, + IsKnownBaseMapTy &KnownBases) { for (Value *ptr : live) { - Value *base = findBasePointer(ptr, DVCache); + Value *base = findBasePointer(ptr, DVCache, KnownBases); assert(base && "failed to find base pointer"); PointerToBase[ptr] = base; assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) || @@ -1272,7 +1359,8 @@ static void findBasePointers(const StatepointLiveSetTy &live, static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, CallBase *Call, PartiallyConstructedSafepointRecord &result, - PointerToBaseTy &PointerToBase) { + PointerToBaseTy &PointerToBase, + IsKnownBaseMapTy &KnownBases) { StatepointLiveSetTy PotentiallyDerivedPointers = result.LiveSet; // We assume that all pointers passed to deopt are base pointers; as an // optimization, we can use this to avoid seperately materializing the base @@ -1286,7 +1374,8 @@ static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, PotentiallyDerivedPointers.remove(V); PointerToBase[V] = V; } - findBasePointers(PotentiallyDerivedPointers, PointerToBase, &DT, DVCache); + findBasePointers(PotentiallyDerivedPointers, PointerToBase, &DT, DVCache, + KnownBases); } /// Given an updated version of the dataflow liveness results, update the @@ -1349,23 +1438,23 @@ static constexpr Attribute::AttrKind FnAttrsToStrip[] = // Create new attribute set containing only attributes which can be transferred // from original call to the safepoint. static AttributeList legalizeCallAttributes(LLVMContext &Ctx, - AttributeList AL) { - if (AL.isEmpty()) - return AL; + AttributeList OrigAL, + AttributeList StatepointAL) { + if (OrigAL.isEmpty()) + return StatepointAL; // Remove the readonly, readnone, and statepoint function attributes. 
- AttrBuilder FnAttrs(Ctx, AL.getFnAttrs()); + AttrBuilder FnAttrs(Ctx, OrigAL.getFnAttrs()); for (auto Attr : FnAttrsToStrip) FnAttrs.removeAttribute(Attr); - for (Attribute A : AL.getFnAttrs()) { + for (Attribute A : OrigAL.getFnAttrs()) { if (isStatepointDirectiveAttr(A)) FnAttrs.removeAttribute(A); } // Just skip parameter and return attributes for now - return AttributeList::get(Ctx, AttributeList::FunctionIndex, - AttributeSet::get(Ctx, FnAttrs)); + return StatepointAL.addFnAttributes(Ctx, FnAttrs); } /// Helper function to place all gc relocates necessary for the given @@ -1570,8 +1659,8 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ assert(DeoptLowering.equals("live-through") && "Unsupported value!"); } - Value *CallTarget = Call->getCalledOperand(); - if (Function *F = dyn_cast<Function>(CallTarget)) { + FunctionCallee CallTarget(Call->getFunctionType(), Call->getCalledOperand()); + if (Function *F = dyn_cast<Function>(CallTarget.getCallee())) { auto IID = F->getIntrinsicID(); if (IID == Intrinsic::experimental_deoptimize) { // Calls to llvm.experimental.deoptimize are lowered to calls to the @@ -1589,8 +1678,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ // the same module. This is fine -- we assume the frontend knew what it // was doing when generating this kind of IR. CallTarget = F->getParent() - ->getOrInsertFunction("__llvm_deoptimize", FTy) - .getCallee(); + ->getOrInsertFunction("__llvm_deoptimize", FTy); IsDeoptimize = true; } else if (IID == Intrinsic::memcpy_element_unordered_atomic || @@ -1686,8 +1774,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ CallTarget = F->getParent() - ->getOrInsertFunction(GetFunctionName(IID, ElementSizeCI), FTy) - .getCallee(); + ->getOrInsertFunction(GetFunctionName(IID, ElementSizeCI), FTy); } } @@ -1705,8 +1792,8 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ // function attributes. 
In case if we can handle this set of attributes - // set up function attrs directly on statepoint and return attrs later for // gc_result intrinsic. - SPCall->setAttributes( - legalizeCallAttributes(CI->getContext(), CI->getAttributes())); + SPCall->setAttributes(legalizeCallAttributes( + CI->getContext(), CI->getAttributes(), SPCall->getAttributes())); Token = cast<GCStatepointInst>(SPCall); @@ -1732,8 +1819,8 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ // function attributes. In case if we can handle this set of attributes - // set up function attrs directly on statepoint and return attrs later for // gc_result intrinsic. - SPInvoke->setAttributes( - legalizeCallAttributes(II->getContext(), II->getAttributes())); + SPInvoke->setAttributes(legalizeCallAttributes( + II->getContext(), II->getAttributes(), SPInvoke->getAttributes())); Token = cast<GCStatepointInst>(SPInvoke); @@ -2071,6 +2158,7 @@ static void relocationViaAlloca( assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues && "we must have the same allocas with lives"); + (void) NumRematerializedValues; if (!PromotableAllocas.empty()) { // Apply mem2reg to promote alloca to SSA PromoteMemToReg(PromotableAllocas, DT); @@ -2221,27 +2309,25 @@ static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPh return true; } -// From the statepoint live set pick values that are cheaper to recompute then -// to relocate. Remove this values from the live set, rematerialize them after -// statepoint and record them in "Info" structure. Note that similar to -// relocated values we don't do any user adjustments here. -static void rematerializeLiveValues(CallBase *Call, - PartiallyConstructedSafepointRecord &Info, - PointerToBaseTy &PointerToBase, - TargetTransformInfo &TTI) { +// Find derived pointers that can be recomputed cheap enough and fill +// RematerizationCandidates with such candidates. 
+static void +findRematerializationCandidates(PointerToBaseTy PointerToBase, + RematCandTy &RematerizationCandidates, + TargetTransformInfo &TTI) { const unsigned int ChainLengthThreshold = 10; - // Record values we are going to delete from this statepoint live set. - // We can not di this in following loop due to iterator invalidation. - SmallVector<Value *, 32> LiveValuesToBeDeleted; + for (auto P2B : PointerToBase) { + auto *Derived = P2B.first; + auto *Base = P2B.second; + // Consider only derived pointers. + if (Derived == Base) + continue; - for (Value *LiveValue: Info.LiveSet) { - // For each live pointer find its defining chain + // For each live pointer find its defining chain. SmallVector<Instruction *, 3> ChainToBase; - assert(PointerToBase.count(LiveValue)); Value *RootOfChain = - findRematerializableChainToBasePointer(ChainToBase, - LiveValue); + findRematerializableChainToBasePointer(ChainToBase, Derived); // Nothing to do, or chain is too long if ( ChainToBase.size() == 0 || @@ -2250,9 +2336,9 @@ static void rematerializeLiveValues(CallBase *Call, // Handle the scenario where the RootOfChain is not equal to the // Base Value, but they are essentially the same phi values. - if (RootOfChain != PointerToBase[LiveValue]) { + if (RootOfChain != PointerToBase[Derived]) { PHINode *OrigRootPhi = dyn_cast<PHINode>(RootOfChain); - PHINode *AlternateRootPhi = dyn_cast<PHINode>(PointerToBase[LiveValue]); + PHINode *AlternateRootPhi = dyn_cast<PHINode>(PointerToBase[Derived]); if (!OrigRootPhi || !AlternateRootPhi) continue; // PHI nodes that have the same incoming values, and belonging to the same @@ -2266,33 +2352,61 @@ static void rematerializeLiveValues(CallBase *Call, // deficiency in the findBasePointer algorithm. if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi)) continue; - // Now that the phi nodes are proved to be the same, assert that - // findBasePointer's newly generated AlternateRootPhi is present in the - // liveset of the call. 
- assert(Info.LiveSet.count(AlternateRootPhi)); } - // Compute cost of this chain + // Compute cost of this chain. InstructionCost Cost = chainToBasePointerCost(ChainToBase, TTI); // TODO: We can also account for cases when we will be able to remove some // of the rematerialized values by later optimization passes. I.e if // we rematerialized several intersecting chains. Or if original values // don't have any uses besides this statepoint. + // Ok, there is a candidate. + RematerizlizationCandidateRecord Record; + Record.ChainToBase = ChainToBase; + Record.RootOfChain = RootOfChain; + Record.Cost = Cost; + RematerizationCandidates.insert({ Derived, Record }); + } +} + +// From the statepoint live set pick values that are cheaper to recompute then +// to relocate. Remove this values from the live set, rematerialize them after +// statepoint and record them in "Info" structure. Note that similar to +// relocated values we don't do any user adjustments here. +static void rematerializeLiveValues(CallBase *Call, + PartiallyConstructedSafepointRecord &Info, + PointerToBaseTy &PointerToBase, + RematCandTy &RematerizationCandidates, + TargetTransformInfo &TTI) { + // Record values we are going to delete from this statepoint live set. + // We can not di this in following loop due to iterator invalidation. + SmallVector<Value *, 32> LiveValuesToBeDeleted; + + for (Value *LiveValue : Info.LiveSet) { + auto It = RematerizationCandidates.find(LiveValue); + if (It == RematerizationCandidates.end()) + continue; + + RematerizlizationCandidateRecord &Record = It->second; + + InstructionCost Cost = Record.Cost; // For invokes we need to rematerialize each chain twice - for normal and // for unwind basic blocks. Model this by multiplying cost by two. - if (isa<InvokeInst>(Call)) { + if (isa<InvokeInst>(Call)) Cost *= 2; - } - // If it's too expensive - skip it + + // If it's too expensive - skip it. 
if (Cost >= RematerializationThreshold) continue; // Remove value from the live set LiveValuesToBeDeleted.push_back(LiveValue); - // Clone instructions and record them inside "Info" structure + // Clone instructions and record them inside "Info" structure. - // Walk backwards to visit top-most instructions first + // For each live pointer find get its defining chain. + SmallVector<Instruction *, 3> ChainToBase = Record.ChainToBase; + // Walk backwards to visit top-most instructions first. std::reverse(ChainToBase.begin(), ChainToBase.end()); // Utility function which clones all instructions from "ChainToBase" @@ -2352,7 +2466,7 @@ static void rematerializeLiveValues(CallBase *Call, Instruction *InsertBefore = Call->getNextNode(); assert(InsertBefore); Instruction *RematerializedValue = rematerializeChain( - InsertBefore, RootOfChain, PointerToBase[LiveValue]); + InsertBefore, Record.RootOfChain, PointerToBase[LiveValue]); Info.RematerializedValues[RematerializedValue] = LiveValue; } else { auto *Invoke = cast<InvokeInst>(Call); @@ -2363,9 +2477,9 @@ static void rematerializeLiveValues(CallBase *Call, &*Invoke->getUnwindDest()->getFirstInsertionPt(); Instruction *NormalRematerializedValue = rematerializeChain( - NormalInsertBefore, RootOfChain, PointerToBase[LiveValue]); + NormalInsertBefore, Record.RootOfChain, PointerToBase[LiveValue]); Instruction *UnwindRematerializedValue = rematerializeChain( - UnwindInsertBefore, RootOfChain, PointerToBase[LiveValue]); + UnwindInsertBefore, Record.RootOfChain, PointerToBase[LiveValue]); Info.RematerializedValues[NormalRematerializedValue] = LiveValue; Info.RematerializedValues[UnwindRematerializedValue] = LiveValue; @@ -2380,7 +2494,8 @@ static void rematerializeLiveValues(CallBase *Call, static bool inlineGetBaseAndOffset(Function &F, SmallVectorImpl<CallInst *> &Intrinsics, - DefiningValueMapTy &DVCache) { + DefiningValueMapTy &DVCache, + IsKnownBaseMapTy &KnownBases) { auto &Context = F.getContext(); auto &DL = 
F.getParent()->getDataLayout(); bool Changed = false; @@ -2389,7 +2504,8 @@ static bool inlineGetBaseAndOffset(Function &F, switch (Callsite->getIntrinsicID()) { case Intrinsic::experimental_gc_get_pointer_base: { Changed = true; - Value *Base = findBasePointer(Callsite->getOperand(0), DVCache); + Value *Base = + findBasePointer(Callsite->getOperand(0), DVCache, KnownBases); assert(!DVCache.count(Callsite)); auto *BaseBC = IRBuilder<>(Callsite).CreateBitCast( Base, Callsite->getType(), suffixed_name_or(Base, ".cast", "")); @@ -2404,7 +2520,7 @@ static bool inlineGetBaseAndOffset(Function &F, case Intrinsic::experimental_gc_get_pointer_offset: { Changed = true; Value *Derived = Callsite->getOperand(0); - Value *Base = findBasePointer(Derived, DVCache); + Value *Base = findBasePointer(Derived, DVCache, KnownBases); assert(!DVCache.count(Callsite)); unsigned AddressSpace = Derived->getType()->getPointerAddressSpace(); unsigned IntPtrSize = DL.getPointerSizeInBits(AddressSpace); @@ -2431,7 +2547,8 @@ static bool inlineGetBaseAndOffset(Function &F, static bool insertParsePoints(Function &F, DominatorTree &DT, TargetTransformInfo &TTI, SmallVectorImpl<CallBase *> &ToUpdate, - DefiningValueMapTy &DVCache) { + DefiningValueMapTy &DVCache, + IsKnownBaseMapTy &KnownBases) { #ifndef NDEBUG // Validate the input std::set<CallBase *> Uniqued; @@ -2487,7 +2604,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, // B) Find the base pointers for each live pointer for (size_t i = 0; i < Records.size(); i++) { PartiallyConstructedSafepointRecord &info = Records[i]; - findBasePointers(DT, DVCache, ToUpdate[i], info, PointerToBase); + findBasePointers(DT, DVCache, ToUpdate[i], info, PointerToBase, KnownBases); } if (PrintBasePointers) { errs() << "Base Pairs (w/o Relocation):\n"; @@ -2563,11 +2680,16 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Holders.clear(); + // Compute the cost of possible re-materialization of derived pointers. 
+ RematCandTy RematerizationCandidates; + findRematerializationCandidates(PointerToBase, RematerizationCandidates, TTI); + // In order to reduce live set of statepoint we might choose to rematerialize // some values instead of relocating them. This is purely an optimization and // does not influence correctness. for (size_t i = 0; i < Records.size(); i++) - rematerializeLiveValues(ToUpdate[i], Records[i], PointerToBase, TTI); + rematerializeLiveValues(ToUpdate[i], Records[i], PointerToBase, + RematerizationCandidates, TTI); // We need this to safely RAUW and delete call or invoke return values that // may themselves be live over a statepoint. For details, please see usage in @@ -2930,13 +3052,18 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, // inlineGetBaseAndOffset() and insertParsePoints(). DefiningValueMapTy DVCache; + // Mapping between a base values and a flag indicating whether it's a known + // base or not. + IsKnownBaseMapTy KnownBases; + if (!Intrinsics.empty()) // Inline @gc.get.pointer.base() and @gc.get.pointer.offset() before finding // live references. 
- MadeChange |= inlineGetBaseAndOffset(F, Intrinsics, DVCache); + MadeChange |= inlineGetBaseAndOffset(F, Intrinsics, DVCache, KnownBases); if (!ParsePointNeeded.empty()) - MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded, DVCache); + MadeChange |= + insertParsePoints(F, DT, TTI, ParsePointNeeded, DVCache, KnownBases); return MadeChange; } diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index c34da51e6dc1..2282ef636076 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -17,20 +17,15 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/SCCP.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" @@ -38,14 +33,13 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/InstVisitor.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" @@ -59,7 +53,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" -#include 
"llvm/Transforms/Utils/PredicateInfo.h" +#include "llvm/Transforms/Utils/SCCPSolver.h" #include <cassert> #include <utility> #include <vector> @@ -97,6 +91,18 @@ static bool isOverdefined(const ValueLatticeElement &LV) { return !LV.isUnknownOrUndef() && !isConstant(LV); } +static bool canRemoveInstruction(Instruction *I) { + if (wouldInstructionBeTriviallyDead(I)) + return true; + + // Some instructions can be handled but are rejected above. Catch + // those cases by falling through to here. + // TODO: Mark globals as being constant earlier, so + // TODO: wouldInstructionBeTriviallyDead() knows that atomic loads + // TODO: are safe to remove. + return isa<LoadInst>(I); +} + static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { Constant *Const = nullptr; if (V->getType()->isStructTy()) { @@ -127,7 +133,8 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { // Calls with "clang.arc.attachedcall" implicitly use the return value and // those uses cannot be updated with a constant. 
CallBase *CB = dyn_cast<CallBase>(V); - if (CB && ((CB->isMustTailCall() && !CB->isSafeToRemove()) || + if (CB && ((CB->isMustTailCall() && + !canRemoveInstruction(CB)) || CB->getOperandBundle(LLVMContext::OB_clang_arc_attachedcall))) { Function *F = CB->getCalledFunction(); @@ -156,7 +163,7 @@ static bool simplifyInstsInBlock(SCCPSolver &Solver, BasicBlock &BB, if (Inst.getType()->isVoidTy()) continue; if (tryToReplaceWithConstant(Solver, &Inst)) { - if (Inst.isSafeToRemove()) + if (canRemoveInstruction(&Inst)) Inst.eraseFromParent(); MadeChanges = true; @@ -170,6 +177,7 @@ static bool simplifyInstsInBlock(SCCPSolver &Solver, BasicBlock &BB, continue; if (IV.getConstantRange().isAllNonNegative()) { auto *ZExt = new ZExtInst(ExtOp, Inst.getType(), "", &Inst); + ZExt->takeName(&Inst); InsertedValues.insert(ZExt); Inst.replaceAllUsesWith(ZExt); Solver.removeLatticeValueFor(&Inst); @@ -182,10 +190,14 @@ static bool simplifyInstsInBlock(SCCPSolver &Solver, BasicBlock &BB, return MadeChanges; } +static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, + DomTreeUpdater &DTU, + BasicBlock *&NewUnreachableBB); + // runSCCP() - Run the Sparse Conditional Constant Propagation algorithm, // and return true if the function was modified. static bool runSCCP(Function &F, const DataLayout &DL, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo *TLI, DomTreeUpdater &DTU) { LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); SCCPSolver Solver( DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; }, @@ -213,13 +225,12 @@ static bool runSCCP(Function &F, const DataLayout &DL, // as we cannot modify the CFG of the function. 
SmallPtrSet<Value *, 32> InsertedValues; + SmallVector<BasicBlock *, 8> BlocksToErase; for (BasicBlock &BB : F) { if (!Solver.isBlockExecutable(&BB)) { LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB); - ++NumDeadBlocks; - NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB).first; - + BlocksToErase.push_back(&BB); MadeChanges = true; continue; } @@ -228,17 +239,32 @@ static bool runSCCP(Function &F, const DataLayout &DL, NumInstRemoved, NumInstReplaced); } + // Remove unreachable blocks and non-feasible edges. + for (BasicBlock *DeadBB : BlocksToErase) + NumInstRemoved += changeToUnreachable(DeadBB->getFirstNonPHI(), + /*PreserveLCSSA=*/false, &DTU); + + BasicBlock *NewUnreachableBB = nullptr; + for (BasicBlock &BB : F) + MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU, NewUnreachableBB); + + for (BasicBlock *DeadBB : BlocksToErase) + if (!DeadBB->hasAddressTaken()) + DTU.deleteBB(DeadBB); + return MadeChanges; } PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) { const DataLayout &DL = F.getParent()->getDataLayout(); auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); - if (!runSCCP(F, DL, &TLI)) + auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + if (!runSCCP(F, DL, &TLI, DTU)) return PreservedAnalyses::all(); auto PA = PreservedAnalyses(); - PA.preserveSet<CFGAnalyses>(); + PA.preserve<DominatorTreeAnalysis>(); return PA; } @@ -261,7 +287,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); - AU.setPreservesCFG(); + AU.addPreserved<DominatorTreeWrapperPass>(); } // runOnFunction - Run the Sparse Conditional Constant Propagation @@ -272,7 +298,10 @@ public: const DataLayout &DL = F.getParent()->getDataLayout(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); - return runSCCP(F, DL, TLI); + auto *DTWP = 
getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DomTreeUpdater DTU(DTWP ? &DTWP->getDomTree() : nullptr, + DomTreeUpdater::UpdateStrategy::Lazy); + return runSCCP(F, DL, TLI, DTU); } }; @@ -342,7 +371,8 @@ static void findReturnsToZap(Function &F, } static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, - DomTreeUpdater &DTU) { + DomTreeUpdater &DTU, + BasicBlock *&NewUnreachableBB) { SmallPtrSet<BasicBlock *, 8> FeasibleSuccessors; bool HasNonFeasibleEdges = false; for (BasicBlock *Succ : successors(BB)) { @@ -362,7 +392,19 @@ static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, isa<IndirectBrInst>(TI)) && "Terminator must be a br, switch or indirectbr"); - if (FeasibleSuccessors.size() == 1) { + if (FeasibleSuccessors.size() == 0) { + // Branch on undef/poison, replace with unreachable. + SmallPtrSet<BasicBlock *, 8> SeenSuccs; + SmallVector<DominatorTree::UpdateType, 8> Updates; + for (BasicBlock *Succ : successors(BB)) { + Succ->removePredecessor(BB); + if (SeenSuccs.insert(Succ).second) + Updates.push_back({DominatorTree::Delete, BB, Succ}); + } + TI->eraseFromParent(); + new UnreachableInst(BB->getContext(), BB); + DTU.applyUpdatesPermissive(Updates); + } else if (FeasibleSuccessors.size() == 1) { // Replace with an unconditional branch to the only feasible successor. BasicBlock *OnlyFeasibleSuccessor = *FeasibleSuccessors.begin(); SmallVector<DominatorTree::UpdateType, 8> Updates; @@ -385,6 +427,23 @@ static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, } else if (FeasibleSuccessors.size() > 1) { SwitchInstProfUpdateWrapper SI(*cast<SwitchInst>(TI)); SmallVector<DominatorTree::UpdateType, 8> Updates; + + // If the default destination is unfeasible it will never be taken. Replace + // it with a new block with a single Unreachable instruction. 
+ BasicBlock *DefaultDest = SI->getDefaultDest(); + if (!FeasibleSuccessors.contains(DefaultDest)) { + if (!NewUnreachableBB) { + NewUnreachableBB = + BasicBlock::Create(DefaultDest->getContext(), "default.unreachable", + DefaultDest->getParent(), DefaultDest); + new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB); + } + + SI->setDefaultDest(NewUnreachableBB); + Updates.push_back({DominatorTree::Delete, BB, DefaultDest}); + Updates.push_back({DominatorTree::Insert, BB, NewUnreachableBB}); + } + for (auto CI = SI->case_begin(); CI != SI->case_end();) { if (FeasibleSuccessors.contains(CI->getCaseSuccessor())) { ++CI; @@ -532,11 +591,13 @@ bool llvm::runIPSCCP( NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(), /*PreserveLCSSA=*/false, &DTU); + BasicBlock *NewUnreachableBB = nullptr; for (BasicBlock &BB : F) - MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU); + MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU, NewUnreachableBB); for (BasicBlock *DeadBB : BlocksToErase) - DTU.deleteBB(DeadBB); + if (!DeadBB->hasAddressTaken()) + DTU.deleteBB(DeadBB); for (BasicBlock &BB : F) { for (Instruction &Inst : llvm::make_early_inc_range(BB)) { diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 8be8946702be..143a035749c7 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -57,11 +57,9 @@ #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" @@ -78,14 +76,12 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include 
"llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include <algorithm> #include <cassert> -#include <chrono> #include <cstddef> #include <cstdint> #include <cstring> @@ -1016,7 +1012,7 @@ private: I.getParent()->getFirstInsertionPt() == I.getParent()->end()) return PI.setAborted(&I); - // TODO: We could use SimplifyInstruction here to fold PHINodes and + // TODO: We could use simplifyInstruction here to fold PHINodes and // SelectInsts. However, doing so requires to change the current // dead-operand-tracking mechanism. For instance, suppose neither loading // from %U nor %other traps. Then "load (select undef, %U, %other)" does not @@ -1987,13 +1983,22 @@ static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t RelBegin = S.beginOffset() - AllocBeginOffset; uint64_t RelEnd = S.endOffset() - AllocBeginOffset; + Use *U = S.getUse(); + + // Lifetime intrinsics operate over the whole alloca whose sizes are usually + // larger than other load/store slices (RelEnd > Size). But lifetime are + // always promotable and should not impact other slices' promotability of the + // partition. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { + if (II->isLifetimeStartOrEnd() || II->isDroppable()) + return true; + } + // We can't reasonably handle cases where the load or store extends past // the end of the alloca's type and into its padding. if (RelEnd > Size) return false; - Use *U = S.getUse(); - if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { if (LI->isVolatile()) return false; @@ -2048,9 +2053,6 @@ static bool isIntegerWideningViableForSlice(const Slice &S, return false; if (!S.isSplittable()) return false; // Skip any unsplittable intrinsics. 
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { - if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) - return false; } else { return false; } @@ -2179,10 +2181,7 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, return V; } - SmallVector<int, 8> Mask; - Mask.reserve(NumElements); - for (unsigned i = BeginIndex; i != EndIndex; ++i) - Mask.push_back(i); + auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex)); V = IRB.CreateShuffleVector(V, Mask, Name + ".extract"); LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n"); return V; @@ -2734,10 +2733,9 @@ private: Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8); V = IRB.CreateMul( IRB.CreateZExt(V, SplatIntTy, "zext"), - ConstantExpr::getUDiv( - Constant::getAllOnesValue(SplatIntTy), - ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()), - SplatIntTy)), + IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy), + IRB.CreateZExt(Constant::getAllOnesValue(V->getType()), + SplatIntTy)), "isplat"); return V; } @@ -2887,7 +2885,7 @@ private: assert((IsDest && II.getRawDest() == OldPtr) || (!IsDest && II.getRawSource() == OldPtr)); - MaybeAlign SliceAlign = getSliceAlign(); + Align SliceAlign = getSliceAlign(); // For unsplit intrinsics, we simply modify the source and destination // pointers in place. This isn't just an optimization, it is a matter of @@ -3481,19 +3479,13 @@ private: Type *Ty = GEPI.getSourceElementType(); Value *True = Sel->getTrueValue(); - Value *NTrue = - IsInBounds - ? IRB.CreateInBoundsGEP(Ty, True, Index, - True->getName() + ".sroa.gep") - : IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep"); + Value *NTrue = IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep", + IsInBounds); Value *False = Sel->getFalseValue(); - Value *NFalse = - IsInBounds - ? 
IRB.CreateInBoundsGEP(Ty, False, Index, - False->getName() + ".sroa.gep") - : IRB.CreateGEP(Ty, False, Index, False->getName() + ".sroa.gep"); + Value *NFalse = IRB.CreateGEP(Ty, False, Index, + False->getName() + ".sroa.gep", IsInBounds); Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse, Sel->getName() + ".sroa.sel"); @@ -3547,10 +3539,8 @@ private: IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator())); Type *Ty = GEPI.getSourceElementType(); - NewVal = IsInBounds ? IRB.CreateInBoundsGEP(Ty, In, Index, - In->getName() + ".sroa.gep") - : IRB.CreateGEP(Ty, In, Index, - In->getName() + ".sroa.gep"); + NewVal = IRB.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep", + IsInBounds); } NewPN->addIncoming(NewVal, B); } @@ -3972,16 +3962,15 @@ bool SROAPass::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { for (LoadInst *LI : Loads) { SplitLoads.clear(); - IntegerType *Ty = cast<IntegerType>(LI->getType()); - assert(Ty->getBitWidth() % 8 == 0); - uint64_t LoadSize = Ty->getBitWidth() / 8; - assert(LoadSize > 0 && "Cannot have a zero-sized integer load!"); - auto &Offsets = SplitOffsetsMap[LI]; - assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() && - "Slice size should always match load size exactly!"); + unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset(); + assert(LI->getType()->getIntegerBitWidth() % 8 == 0 && + "Load must have type size equal to store size"); + assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize && + "Load must be >= slice size"); + uint64_t BaseOffset = Offsets.S->beginOffset(); - assert(BaseOffset + LoadSize > BaseOffset && + assert(BaseOffset + SliceSize > BaseOffset && "Cannot represent alloca access size using 64-bit integers!"); Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand()); @@ -3992,7 +3981,7 @@ bool SROAPass::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); int Idx = 0, 
Size = Offsets.Splits.size(); for (;;) { - auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); + auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8); auto AS = LI->getPointerAddressSpace(); auto *PartPtrTy = PartTy->getPointerTo(AS); LoadInst *PLoad = IRB.CreateAlignedLoad( @@ -4025,7 +4014,7 @@ bool SROAPass::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // Setup the next partition. PartOffset = Offsets.Splits[Idx]; ++Idx; - PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset; + PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset; } // Now that we have the split loads, do the slow walk over all uses of the diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index f9650efc051f..008ddfc72740 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -16,16 +16,13 @@ #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/Scalar.h" #include "llvm/Analysis/BasicAliasAnalysis.h" -#include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/Scalarizer.h" -#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" using namespace llvm; @@ -76,7 +73,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopRerollLegacyPassPass(Registry); initializeLoopUnrollPass(Registry); initializeLoopUnrollAndJamPass(Registry); - initializeLoopUnswitchPass(Registry); initializeWarnMissedTransformationsLegacyPass(Registry); initializeLoopVersioningLICMLegacyPassPass(Registry); initializeLoopIdiomRecognizeLegacyPassPass(Registry); @@ -104,6 +100,7 @@ void 
llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSimpleLoopUnswitchLegacyPassPass(Registry); initializeSinkingLegacyPassPass(Registry); initializeTailCallElimPass(Registry); + initializeTLSVariableHoistLegacyPassPass(Registry); initializeSeparateConstOffsetFromGEPLegacyPassPass(Registry); initializeSpeculativeExecutionLegacyPassPass(Registry); initializeStraightLineStrengthReduceLegacyPassPass(Registry); @@ -214,10 +211,6 @@ void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopUnrollAndJamPass()); } -void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createLoopUnswitchPass()); -} - void LLVMAddLowerAtomicPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLowerAtomicPass()); } diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index 29cea42e4a00..e2976ace3a4a 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -1,5 +1,5 @@ //===- ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ----===// -// instrinsics +// intrinsics // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -24,11 +24,9 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" @@ -36,7 +34,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include <algorithm> #include <cassert> using namespace llvm; @@ -876,7 +873,7 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI, for (BasicBlock &BB : llvm::make_early_inc_range(F)) { bool ModifiedDTOnIteration = false; MadeChange |= optimizeBlock(BB, ModifiedDTOnIteration, TTI, DL, - DTU.hasValue() ? DTU.getPointer() : nullptr); + DTU ? DTU.getPointer() : nullptr); // Restart BB iteration if the dominator tree of the Function was changed if (ModifiedDTOnIteration) diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 3606c8a4b073..08f4b2173da2 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -39,8 +39,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> @@ -52,7 +50,7 @@ using namespace llvm; #define DEBUG_TYPE "scalarizer" -static cl::opt<bool> ScalarizeVariableInsertExtract( +static cl::opt<bool> ClScalarizeVariableInsertExtract( "scalarize-variable-insert-extract", cl::init(true), cl::Hidden, cl::desc("Allow the scalarizer pass to scalarize " "insertelement/extractelement with variable index")); @@ -60,9 +58,9 @@ static cl::opt<bool> ScalarizeVariableInsertExtract( // This is disabled by default because having separate loads and stores // 
makes it more likely that the -combiner-alias-analysis limits will be // reached. -static cl::opt<bool> - ScalarizeLoadStore("scalarize-load-store", cl::init(false), cl::Hidden, - cl::desc("Allow the scalarizer pass to scalarize loads and store")); +static cl::opt<bool> ClScalarizeLoadStore( + "scalarize-load-store", cl::init(false), cl::Hidden, + cl::desc("Allow the scalarizer pass to scalarize loads and store")); namespace { @@ -96,7 +94,7 @@ public: // Scatter V into Size components. If new instructions are needed, // insert them before BBI in BB. If Cache is nonnull, use it to cache // the results. - Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, + Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, Type *PtrElemTy, ValueVector *cachePtr = nullptr); // Return component I, creating a new Value for it if necessary. @@ -109,8 +107,8 @@ private: BasicBlock *BB; BasicBlock::iterator BBI; Value *V; + Type *PtrElemTy; ValueVector *CachePtr; - PointerType *PtrTy; ValueVector Tmp; unsigned Size; }; @@ -188,10 +186,23 @@ struct VectorLayout { uint64_t ElemSize = 0; }; +template <typename T> +T getWithDefaultOverride(const cl::opt<T> &ClOption, + const llvm::Optional<T> &DefaultOverride) { + return ClOption.getNumOccurrences() ? 
ClOption + : DefaultOverride.value_or(ClOption); +} + class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> { public: - ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT) - : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT) { + ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT, + ScalarizerPassOptions Options) + : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT), + ScalarizeVariableInsertExtract( + getWithDefaultOverride(ClScalarizeVariableInsertExtract, + Options.ScalarizeVariableInsertExtract)), + ScalarizeLoadStore(getWithDefaultOverride(ClScalarizeLoadStore, + Options.ScalarizeLoadStore)) { } bool visit(Function &F); @@ -216,8 +227,9 @@ public: bool visitCallInst(CallInst &ICI); private: - Scatterer scatter(Instruction *Point, Value *V); + Scatterer scatter(Instruction *Point, Value *V, Type *PtrElemTy = nullptr); void gather(Instruction *Op, const ValueVector &CV); + void replaceUses(Instruction *Op, Value *CV); bool canTransferMetadata(unsigned Kind); void transferMetadataAndIRFlags(Instruction *Op, const ValueVector &CV); Optional<VectorLayout> getVectorLayout(Type *Ty, Align Alignment, @@ -231,12 +243,16 @@ private: ScatterMap Scattered; GatherList Gathered; + bool Scalarized; SmallVector<WeakTrackingVH, 32> PotentiallyDeadInstrs; unsigned ParallelLoopAccessMDKind; DominatorTree *DT; + + const bool ScalarizeVariableInsertExtract; + const bool ScalarizeLoadStore; }; class ScalarizerLegacyPass : public FunctionPass { @@ -265,12 +281,14 @@ INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer", "Scalarize vector operations", false, false) Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, - ValueVector *cachePtr) - : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) { + Type *PtrElemTy, ValueVector *cachePtr) + : BB(bb), BBI(bbi), V(v), PtrElemTy(PtrElemTy), CachePtr(cachePtr) { Type *Ty = V->getType(); - PtrTy = dyn_cast<PointerType>(Ty); - if (PtrTy) - Ty = 
PtrTy->getPointerElementType(); + if (Ty->isPointerTy()) { + assert(cast<PointerType>(Ty)->isOpaqueOrPointeeTypeMatches(PtrElemTy) && + "Pointer element type mismatch"); + Ty = PtrElemTy; + } Size = cast<FixedVectorType>(Ty)->getNumElements(); if (!CachePtr) Tmp.resize(Size, nullptr); @@ -287,15 +305,15 @@ Value *Scatterer::operator[](unsigned I) { if (CV[I]) return CV[I]; IRBuilder<> Builder(BB, BBI); - if (PtrTy) { - Type *ElTy = - cast<VectorType>(PtrTy->getPointerElementType())->getElementType(); + if (PtrElemTy) { + Type *VectorElemTy = cast<VectorType>(PtrElemTy)->getElementType(); if (!CV[0]) { - Type *NewPtrTy = PointerType::get(ElTy, PtrTy->getAddressSpace()); + Type *NewPtrTy = PointerType::get( + VectorElemTy, V->getType()->getPointerAddressSpace()); CV[0] = Builder.CreateBitCast(V, NewPtrTy, V->getName() + ".i0"); } if (I != 0) - CV[I] = Builder.CreateConstGEP1_32(ElTy, CV[0], I, + CV[I] = Builder.CreateConstGEP1_32(VectorElemTy, CV[0], I, V->getName() + ".i" + Twine(I)); } else { // Search through a chain of InsertElementInsts looking for element I. @@ -334,7 +352,7 @@ bool ScalarizerLegacyPass::runOnFunction(Function &F) { unsigned ParallelLoopAccessMDKind = M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT); + ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT, ScalarizerPassOptions()); return Impl.visit(F); } @@ -345,6 +363,8 @@ FunctionPass *llvm::createScalarizerPass() { bool ScalarizerVisitor::visit(Function &F) { assert(Gathered.empty() && Scattered.empty()); + Scalarized = false; + // To ensure we replace gathered components correctly we need to do an ordered // traversal of the basic blocks in the function. ReversePostOrderTraversal<BasicBlock *> RPOT(&F.getEntryBlock()); @@ -362,13 +382,14 @@ bool ScalarizerVisitor::visit(Function &F) { // Return a scattered form of V that can be accessed by Point. 
V must be a // vector or a pointer to a vector. -Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) { +Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V, + Type *PtrElemTy) { if (Argument *VArg = dyn_cast<Argument>(V)) { // Put the scattered form of arguments in the entry block, // so that it can be used everywhere. Function *F = VArg->getParent(); BasicBlock *BB = &F->getEntryBlock(); - return Scatterer(BB, BB->begin(), V, &Scattered[V]); + return Scatterer(BB, BB->begin(), V, PtrElemTy, &Scattered[V]); } if (Instruction *VOp = dyn_cast<Instruction>(V)) { // When scalarizing PHI nodes we might try to examine/rewrite InsertElement @@ -379,17 +400,17 @@ Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) { // need to analyse them further. if (!DT->isReachableFromEntry(VOp->getParent())) return Scatterer(Point->getParent(), Point->getIterator(), - UndefValue::get(V->getType())); + PoisonValue::get(V->getType()), PtrElemTy); // Put the scattered form of an instruction directly after the // instruction, skipping over PHI nodes and debug intrinsics. BasicBlock *BB = VOp->getParent(); return Scatterer( BB, skipPastPhiNodesAndDbg(std::next(BasicBlock::iterator(VOp))), V, - &Scattered[V]); + PtrElemTy, &Scattered[V]); } // In the fallback case, just put the scattered before Point and // keep the result local to Point. - return Scatterer(Point->getParent(), Point->getIterator(), V); + return Scatterer(Point->getParent(), Point->getIterator(), V, PtrElemTy); } // Replace Op with the gathered form of the components in CV. Defer the @@ -419,6 +440,15 @@ void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) { Gathered.push_back(GatherList::value_type(Op, &SV)); } +// Replace Op with CV and collect Op has a potentially dead instruction. 
+void ScalarizerVisitor::replaceUses(Instruction *Op, Value *CV) { + if (CV != Op) { + Op->replaceAllUsesWith(CV); + PotentiallyDeadInstrs.emplace_back(Op); + Scalarized = true; + } +} + // Return true if it is safe to transfer the given metadata tag from // vector to scalar instructions. bool ScalarizerVisitor::canTransferMetadata(unsigned Tag) { @@ -558,9 +588,11 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { if (OpI->getType()->isVectorTy()) { Scattered[I] = scatter(&CI, OpI); assert(Scattered[I].size() == NumElems && "mismatched call operands"); + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) + Tys.push_back(OpI->getType()->getScalarType()); } else { ScalarOperands[I] = OpI; - if (hasVectorInstrinsicOverloadedScalarOpd(ID, I)) + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) Tys.push_back(OpI->getType()); } } @@ -576,7 +608,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { ScalarCallOps.clear(); for (unsigned J = 0; J != NumArgs; ++J) { - if (hasVectorInstrinsicScalarOpd(ID, J)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) ScalarCallOps.push_back(ScalarOperands[J]); else ScalarCallOps.push_back(Scattered[J][Elem]); @@ -809,7 +841,7 @@ bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) { if (auto *CI = dyn_cast<ConstantInt>(ExtIdx)) { Value *Res = Op0[CI->getValue().getZExtValue()]; - gather(&EEI, {Res}); + replaceUses(&EEI, Res); return true; } @@ -825,7 +857,7 @@ bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) { Res = Builder.CreateSelect(ShouldExtract, Elt, Res, EEI.getName() + ".upto" + Twine(I)); } - gather(&EEI, {Res}); + replaceUses(&EEI, Res); return true; } @@ -891,7 +923,7 @@ bool ScalarizerVisitor::visitLoadInst(LoadInst &LI) { unsigned NumElems = cast<FixedVectorType>(Layout->VecTy)->getNumElements(); IRBuilder<> Builder(&LI); - Scatterer Ptr = scatter(&LI, LI.getPointerOperand()); + Scatterer Ptr = scatter(&LI, LI.getPointerOperand(), LI.getType()); ValueVector Res; 
Res.resize(NumElems); @@ -917,7 +949,7 @@ bool ScalarizerVisitor::visitStoreInst(StoreInst &SI) { unsigned NumElems = cast<FixedVectorType>(Layout->VecTy)->getNumElements(); IRBuilder<> Builder(&SI); - Scatterer VPtr = scatter(&SI, SI.getPointerOperand()); + Scatterer VPtr = scatter(&SI, SI.getPointerOperand(), FullValue->getType()); Scatterer VVal = scatter(&SI, FullValue); ValueVector Stores; @@ -940,7 +972,7 @@ bool ScalarizerVisitor::visitCallInst(CallInst &CI) { bool ScalarizerVisitor::finish() { // The presence of data in Gathered or Scattered indicates changes // made to the Function. - if (Gathered.empty() && Scattered.empty()) + if (Gathered.empty() && Scattered.empty() && !Scalarized) return false; for (const auto &GMI : Gathered) { Instruction *Op = GMI.first; @@ -971,6 +1003,7 @@ bool ScalarizerVisitor::finish() { } Gathered.clear(); Scattered.clear(); + Scalarized = false; RecursivelyDeleteTriviallyDeadInstructionsPermissive(PotentiallyDeadInstrs); @@ -982,7 +1015,7 @@ PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM) unsigned ParallelLoopAccessMDKind = M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F); - ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT); + ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT, Options); bool Changed = Impl.visit(F); PreservedAnalyses PA; PA.preserve<DominatorTreeAnalysis>(); diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index d23925042b0a..7da5a78772ad 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -189,7 +189,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" 
#include <cassert> diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index a27da047bfd3..0535608244cc 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -19,7 +19,6 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/GuardUtils.h" -#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -28,6 +27,7 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -49,7 +49,9 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTree.h" +#include "llvm/Support/InstructionCost.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" @@ -81,7 +83,6 @@ static cl::opt<bool> EnableNonTrivialUnswitch( static cl::opt<int> UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden, - cl::ZeroOrMore, cl::desc("The cost threshold for unswitching a loop.")); static cl::opt<bool> EnableUnswitchCostMultiplier( @@ -110,17 +111,27 @@ static cl::opt<unsigned> "partial unswitching analysis"), cl::init(100), cl::Hidden); static cl::opt<bool> FreezeLoopUnswitchCond( - "freeze-loop-unswitch-cond", cl::init(false), cl::Hidden, + "freeze-loop-unswitch-cond", cl::init(true), cl::Hidden, cl::desc("If enabled, the freeze instruction will be added to condition " "of loop unswitch to prevent miscompilation.")); +// Helper to skip (select x, true, false), which 
matches both a logical AND and +// OR and can confuse code that tries to determine if \p Cond is either a +// logical AND or OR but not both. +static Value *skipTrivialSelect(Value *Cond) { + Value *CondNext; + while (match(Cond, m_Select(m_Value(CondNext), m_One(), m_Zero()))) + Cond = CondNext; + return Cond; +} + /// Collect all of the loop invariant input values transitively used by the /// homogeneous instruction graph from a given root. /// /// This essentially walks from a root recursively through loop variant operands -/// which have the exact same opcode and finds all inputs which are loop -/// invariant. For some operations these can be re-associated and unswitched out -/// of the loop entirely. +/// which have perform the same logical operation (AND or OR) and finds all +/// inputs which are loop invariant. For some operations these can be +/// re-associated and unswitched out of the loop entirely. static TinyPtrVector<Value *> collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root, LoopInfo &LI) { @@ -150,7 +161,7 @@ collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root, } // If not an instruction with the same opcode, nothing we can do. - Instruction *OpI = dyn_cast<Instruction>(OpV); + Instruction *OpI = dyn_cast<Instruction>(skipTrivialSelect(OpV)); if (OpI && ((IsRootAnd && match(OpI, m_LogicalAnd())) || (IsRootOr && match(OpI, m_LogicalOr())))) { @@ -202,13 +213,19 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, /// branch on a single value. static void buildPartialUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction, - BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze) { + BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, + Instruction *I, AssumptionCache *AC, DominatorTree &DT) { IRBuilder<> IRB(&BB); - Value *Cond = Direction ? 
IRB.CreateOr(Invariants) : - IRB.CreateAnd(Invariants); - if (InsertFreeze) - Cond = IRB.CreateFreeze(Cond, Cond->getName() + ".fr"); + SmallVector<Value *> FrozenInvariants; + for (Value *Inv : Invariants) { + if (InsertFreeze && !isGuaranteedNotToBeUndefOrPoison(Inv, AC, I, &DT)) + Inv = IRB.CreateFreeze(Inv, Inv->getName() + ".fr"); + FrozenInvariants.push_back(Inv); + } + + Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants) + : IRB.CreateAnd(FrozenInvariants); IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, Direction ? &NormalSucc : &UnswitchedSucc); } @@ -442,11 +459,12 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // some input conditions to the branch. bool FullUnswitch = false; - if (L.isLoopInvariant(BI.getCondition())) { - Invariants.push_back(BI.getCondition()); + Value *Cond = skipTrivialSelect(BI.getCondition()); + if (L.isLoopInvariant(Cond)) { + Invariants.push_back(Cond); FullUnswitch = true; } else { - if (auto *CondInst = dyn_cast<Instruction>(BI.getCondition())) + if (auto *CondInst = dyn_cast<Instruction>(Cond)) Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI); if (Invariants.empty()) { LLVM_DEBUG(dbgs() << " Couldn't find invariant inputs!\n"); @@ -480,8 +498,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // is a graph of `or` operations, or the exit block is along the false edge // and the condition is a graph of `and` operations. if (!FullUnswitch) { - if (ExitDirection ? !match(BI.getCondition(), m_LogicalOr()) - : !match(BI.getCondition(), m_LogicalAnd())) { + if (ExitDirection ? !match(Cond, m_LogicalOr()) + : !match(Cond, m_LogicalAnd())) { LLVM_DEBUG(dbgs() << " Branch condition is in improper form for " "non-full unswitch!\n"); return false; @@ -546,6 +564,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // its successors. 
OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(), BI); + BI.setCondition(Cond); if (MSSAU) { // Temporarily clone the terminator, to make MSSA update cheaper by // separating "insert edge" updates from "remove edge" ones. @@ -561,15 +580,16 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // Only unswitching a subset of inputs to the condition, so we will need to // build a new branch that merges the invariant inputs. if (ExitDirection) - assert(match(BI.getCondition(), m_LogicalOr()) && + assert(match(skipTrivialSelect(BI.getCondition()), m_LogicalOr()) && "Must have an `or` of `i1`s or `select i1 X, true, Y`s for the " "condition!"); else - assert(match(BI.getCondition(), m_LogicalAnd()) && + assert(match(skipTrivialSelect(BI.getCondition()), m_LogicalAnd()) && "Must have an `and` of `i1`s or `select i1 X, Y, false`s for the" " condition!"); - buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection, - *UnswitchedBB, *NewPH, false); + buildPartialUnswitchConditionalBranch( + *OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH, + FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT); } // Update the dominator tree with the added edge. @@ -1019,7 +1039,8 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, // Don't bother trying to unswitch past an unconditional branch or a branch // with a constant value. These should be removed by simplifycfg prior to // running this pass. - if (!BI->isConditional() || isa<Constant>(BI->getCondition())) + if (!BI->isConditional() || + isa<Constant>(skipTrivialSelect(BI->getCondition()))) return Changed; // Found a trivial condition candidate: non-foldable conditional branch. If @@ -1663,7 +1684,7 @@ deleteDeadBlocksFromLoop(Loop &L, // uses in other blocks. 
for (auto &I : *BB) if (!I.use_empty()) - I.replaceAllUsesWith(UndefValue::get(I.getType())); + I.replaceAllUsesWith(PoisonValue::get(I.getType())); BB->dropAllReferences(); } @@ -2042,12 +2063,13 @@ static void unswitchNontrivialInvariants( "Can only unswitch switches and conditional branch!"); bool PartiallyInvariant = !PartialIVInfo.InstToDuplicate.empty(); bool FullUnswitch = - SI || (BI->getCondition() == Invariants[0] && !PartiallyInvariant); + SI || (skipTrivialSelect(BI->getCondition()) == Invariants[0] && + !PartiallyInvariant); if (FullUnswitch) assert(Invariants.size() == 1 && "Cannot have other invariants with full unswitching!"); else - assert(isa<Instruction>(BI->getCondition()) && + assert(isa<Instruction>(skipTrivialSelect(BI->getCondition())) && "Partial unswitching requires an instruction as the condition!"); if (MSSAU && VerifyMemorySSA) @@ -2062,14 +2084,14 @@ static void unswitchNontrivialInvariants( bool Direction = true; int ClonedSucc = 0; if (!FullUnswitch) { - Value *Cond = BI->getCondition(); + Value *Cond = skipTrivialSelect(BI->getCondition()); (void)Cond; assert(((match(Cond, m_LogicalAnd()) ^ match(Cond, m_LogicalOr())) || PartiallyInvariant) && "Only `or`, `and`, an `select`, partially invariant instructions " "can combine invariants being unswitched."); - if (!match(BI->getCondition(), m_LogicalOr())) { - if (match(BI->getCondition(), m_LogicalAnd()) || + if (!match(Cond, m_LogicalOr())) { + if (match(Cond, m_LogicalAnd()) || (PartiallyInvariant && !PartialIVInfo.KnownValue->isOneValue())) { Direction = false; ClonedSucc = 1; @@ -2209,11 +2231,12 @@ static void unswitchNontrivialInvariants( BasicBlock *ClonedPH = ClonedPHs.begin()->second; BI->setSuccessor(ClonedSucc, ClonedPH); BI->setSuccessor(1 - ClonedSucc, LoopPH); + Value *Cond = skipTrivialSelect(BI->getCondition()); if (InsertFreeze) { - auto Cond = BI->getCondition(); if (!isGuaranteedNotToBeUndefOrPoison(Cond, &AC, BI, &DT)) - BI->setCondition(new FreezeInst(Cond, 
Cond->getName() + ".fr", BI)); + Cond = new FreezeInst(Cond, Cond->getName() + ".fr", BI); } + BI->setCondition(Cond); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); } else { assert(SI && "Must either be a branch or switch!"); @@ -2311,9 +2334,11 @@ static void unswitchNontrivialInvariants( if (PartiallyInvariant) buildPartialInvariantUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); - else - buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction, - *ClonedPH, *LoopPH, InsertFreeze); + else { + buildPartialUnswitchConditionalBranch( + *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, + FreezeLoopUnswitchCond, BI, &AC, DT); + } DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); if (MSSAU) { @@ -2745,22 +2770,16 @@ static bool unswitchBestCondition( BI->getSuccessor(0) == BI->getSuccessor(1)) continue; - // If BI's condition is 'select _, true, false', simplify it to confuse - // matchers - Value *Cond = BI->getCondition(), *CondNext; - while (match(Cond, m_Select(m_Value(CondNext), m_One(), m_Zero()))) - Cond = CondNext; - BI->setCondition(Cond); - + Value *Cond = skipTrivialSelect(BI->getCondition()); if (isa<Constant>(Cond)) continue; - if (L.isLoopInvariant(BI->getCondition())) { - UnswitchCandidates.push_back({BI, {BI->getCondition()}}); + if (L.isLoopInvariant(Cond)) { + UnswitchCandidates.push_back({BI, {Cond}}); continue; } - Instruction &CondI = *cast<Instruction>(BI->getCondition()); + Instruction &CondI = *cast<Instruction>(Cond); if (match(&CondI, m_CombineOr(m_LogicalAnd(), m_LogicalOr()))) { TinyPtrVector<Value *> Invariants = collectHomogenousInstGraphLoopInvariants(L, CondI, LI); @@ -2785,8 +2804,7 @@ static bool unswitchBestCondition( PartialIVInfo = *Info; PartialIVCondBranch = L.getHeader()->getTerminator(); TinyPtrVector<Value *> ValsToDuplicate; - for (auto *Inst : Info->InstToDuplicate) - ValsToDuplicate.push_back(Inst); + 
llvm::append_range(ValsToDuplicate, Info->InstToDuplicate); UnswitchCandidates.push_back( {L.getHeader()->getTerminator(), std::move(ValsToDuplicate)}); } @@ -2902,10 +2920,11 @@ static bool unswitchBestCondition( // its cost. if (!FullUnswitch) { auto &BI = cast<BranchInst>(TI); - if (match(BI.getCondition(), m_LogicalAnd())) { + Value *Cond = skipTrivialSelect(BI.getCondition()); + if (match(Cond, m_LogicalAnd())) { if (SuccBB == BI.getSuccessor(1)) continue; - } else if (match(BI.getCondition(), m_LogicalOr())) { + } else if (match(Cond, m_LogicalOr())) { if (SuccBB == BI.getSuccessor(0)) continue; } else if ((PartialIVInfo.KnownValue->isOneValue() && @@ -2947,8 +2966,9 @@ static bool unswitchBestCondition( ArrayRef<Value *> Invariants = TerminatorAndInvariants.second; BranchInst *BI = dyn_cast<BranchInst>(&TI); InstructionCost CandidateCost = ComputeUnswitchedCost( - TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 && - Invariants[0] == BI->getCondition())); + TI, /*FullUnswitch*/ !BI || + (Invariants.size() == 1 && + Invariants[0] == skipTrivialSelect(BI->getCondition()))); // Calculate cost multiplier which is a tool to limit potentially // exponential behavior of loop-unswitch. if (EnableUnswitchCostMultiplier) { @@ -3131,8 +3151,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, AR.MSSA->verifyMemorySSA(); } if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.AA, AR.TTI, Trivial, NonTrivial, - UnswitchCB, &AR.SE, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, + UnswitchCB, &AR.SE, MSSAU ? 
MSSAU.getPointer() : nullptr, DestroyLoopCB)) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index ee17da1875e5..fb2d812a186d 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -31,19 +31,16 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" #include <utility> @@ -59,6 +56,11 @@ static cl::opt<bool> UserKeepLoops( "keep-loops", cl::Hidden, cl::init(true), cl::desc("Preserve canonical loop structure (default = true)")); +static cl::opt<bool> UserSwitchRangeToICmp( + "switch-range-to-icmp", cl::Hidden, cl::init(false), + cl::desc( + "Convert switches into an integer range comparison (default = false)")); + static cl::opt<bool> UserSwitchToLookup( "switch-to-lookup", cl::Hidden, cl::init(false), cl::desc("Convert switches to lookup tables (default = false)")); @@ -311,6 +313,8 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) { Options.BonusInstThreshold = UserBonusInstThreshold; if (UserForwardSwitchCond.getNumOccurrences()) Options.ForwardSwitchCondToPhi = UserForwardSwitchCond; + if (UserSwitchRangeToICmp.getNumOccurrences()) + Options.ConvertSwitchRangeToICmp = UserSwitchRangeToICmp; if (UserSwitchToLookup.getNumOccurrences()) 
Options.ConvertSwitchToLookupTable = UserSwitchToLookup; if (UserKeepLoops.getNumOccurrences()) @@ -337,6 +341,8 @@ void SimplifyCFGPass::printPipeline( OS << "<"; OS << "bonus-inst-threshold=" << Options.BonusInstThreshold << ";"; OS << (Options.ForwardSwitchCondToPhi ? "" : "no-") << "forward-switch-cond;"; + OS << (Options.ConvertSwitchRangeToICmp ? "" : "no-") + << "switch-range-to-icmp;"; OS << (Options.ConvertSwitchToLookupTable ? "" : "no-") << "switch-to-lookup;"; OS << (Options.NeedCanonicalLoop ? "" : "no-") << "keep-loops;"; diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp index 8600aacdb056..e8fde53005f0 100644 --- a/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/llvm/lib/Transforms/Scalar/Sink.cpp @@ -15,12 +15,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -48,7 +43,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA, } if (Inst->isTerminator() || isa<PHINode>(Inst) || Inst->isEHPad() || - Inst->mayThrow()) + Inst->mayThrow() || !Inst->willReturn()) return false; if (auto *Call = dyn_cast<CallBase>(Inst)) { diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index 06169a7834f6..9ac4608134c2 100644 --- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -63,10 +63,10 @@ #include "llvm/Transforms/Scalar/SpeculativeExecution.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include 
"llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" @@ -275,7 +275,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo( }); } - // Usially debug label instrinsic corresponds to label in LLVM IR. In these + // Usially debug label intrinsic corresponds to label in LLVM IR. In these // cases we should not move it here. // TODO: Possible special processing needed to detect it is related to a // hoisted instruction. @@ -301,7 +301,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo( if (TotalSpeculationCost > SpecExecMaxSpeculationCost) return false; // too much to hoist } else { - // Debug info instrinsics should not be counted for threshold. + // Debug info intrinsics should not be counted for threshold. if (!isa<DbgInfoIntrinsic>(I)) NotHoistedInstCount++; if (NotHoistedInstCount > SpecExecMaxNotHoisted) diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index b47378808216..70df0cec0dca 100644 --- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -68,7 +68,6 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -683,24 +682,16 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis( unsigned AS = Basis.Ins->getType()->getPointerAddressSpace(); Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS); Reduced = Builder.CreateBitCast(Basis.Ins, CharTy); - if (InBounds) - Reduced = - Builder.CreateInBoundsGEP(Builder.getInt8Ty(), Reduced, Bump); - else - Reduced = Builder.CreateGEP(Builder.getInt8Ty(), Reduced, Bump); + Reduced = + 
Builder.CreateGEP(Builder.getInt8Ty(), Reduced, Bump, "", InBounds); Reduced = Builder.CreateBitCast(Reduced, C.Ins->getType()); } else { // C = gep Basis, Bump // Canonicalize bump to pointer size. Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy); - if (InBounds) - Reduced = Builder.CreateInBoundsGEP( - cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(), - Basis.Ins, Bump); - else - Reduced = Builder.CreateGEP( - cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(), - Basis.Ins, Bump); + Reduced = Builder.CreateGEP( + cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(), + Basis.Ins, Bump, "", InBounds); } break; } diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index b3a445368537..f6525ad7de9b 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -18,10 +18,8 @@ #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/RegionPass.h" -#include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -33,7 +31,6 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" -#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" @@ -41,7 +38,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" @@ -72,6 +68,11 @@ static cl::opt<bool> cl::desc("Allow relaxed uniform region checks"), cl::init(true)); +static cl::opt<unsigned> + ReorderNodeSize("structurizecfg-node-reorder-size", + cl::desc("Limit region size for reordering nodes"), + cl::init(100), 
cl::Hidden); + // Definition of the complex types used in this pass. using BBValuePair = std::pair<BasicBlock *, Value *>; @@ -266,6 +267,8 @@ class StructurizeCFG { void orderNodes(); + void reorderNodes(); + void analyzeLoops(RegionNode *N); Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert); @@ -424,6 +427,57 @@ void StructurizeCFG::orderNodes() { } } +/// Change the node ordering to decrease the range of live values, especially +/// the values that capture the control flow path for branches. We do this +/// by moving blocks with a single predecessor and successor to appear after +/// predecessor. The motivation is to move some loop exit blocks into a loop. +/// In cases where a loop has a large number of exit blocks, this reduces the +/// amount of values needed across the loop boundary. +void StructurizeCFG::reorderNodes() { + SmallVector<RegionNode *, 8> NewOrder; + DenseMap<BasicBlock *, unsigned> MoveTo; + BitVector Moved(Order.size()); + + // The benefits of reordering nodes occurs for large regions. + if (Order.size() <= ReorderNodeSize) + return; + + // The algorithm works with two passes over Order. The first pass identifies + // the blocks to move and the position to move them to. The second pass + // creates the new order based upon this information. We move blocks with + // a single predecessor and successor. If there are multiple candidates then + // maintain the original order. + BBSet Seen; + for (int I = Order.size() - 1; I >= 0; --I) { + auto *BB = Order[I]->getEntry(); + Seen.insert(BB); + auto *Pred = BB->getSinglePredecessor(); + auto *Succ = BB->getSingleSuccessor(); + // Consider only those basic blocks that have a predecessor in Order and a + // successor that exits the region. The region may contain subregions that + // have been structurized and are not included in Order. 
+ if (Pred && Succ && Seen.count(Pred) && Succ == ParentRegion->getExit() && + !MoveTo.count(Pred)) { + MoveTo[Pred] = I; + Moved.set(I); + } + } + + // If no blocks have been moved then the original order is good. + if (!Moved.count()) + return; + + for (size_t I = 0, E = Order.size(); I < E; ++I) { + auto *BB = Order[I]->getEntry(); + if (MoveTo.count(BB)) + NewOrder.push_back(Order[MoveTo[BB]]); + if (!Moved[I]) + NewOrder.push_back(Order[I]); + } + + Order.assign(NewOrder); +} + /// Determine the end of the loops void StructurizeCFG::analyzeLoops(RegionNode *N) { if (N->isSubRegion()) { @@ -685,7 +739,7 @@ void StructurizeCFG::simplifyAffectedPhis() { Q.DT = DT; for (WeakVH VH : AffectedPhis) { if (auto Phi = dyn_cast_or_null<PHINode>(VH)) { - if (auto NewValue = SimplifyInstruction(Phi, Q)) { + if (auto NewValue = simplifyInstruction(Phi, Q)) { Phi->replaceAllUsesWith(NewValue); Phi->eraseFromParent(); Changed = true; @@ -1085,12 +1139,13 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) { ParentRegion = R; orderNodes(); + reorderNodes(); collectInfos(); createFlow(); insertConditions(false); insertConditions(true); - simplifyConditions(); setPhiValues(); + simplifyConditions(); simplifyAffectedPhis(); rebuildSSA(); diff --git a/llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp b/llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp new file mode 100644 index 000000000000..16b3483f9687 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp @@ -0,0 +1,306 @@ +//===- TLSVariableHoist.cpp -------- Remove Redundant TLS Loads ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass identifies/eliminate Redundant TLS Loads if related option is set. 
+// The example: Please refer to the comment at the head of TLSVariableHoist.h. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/TLSVariableHoist.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <tuple> +#include <utility> + +using namespace llvm; +using namespace tlshoist; + +#define DEBUG_TYPE "tlshoist" + +static cl::opt<bool> TLSLoadHoist( + "tls-load-hoist", cl::init(false), cl::Hidden, + cl::desc("hoist the TLS loads in PIC model to eliminate redundant " + "TLS address calculation.")); + +namespace { + +/// The TLS Variable hoist pass. 
+class TLSVariableHoistLegacyPass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + + TLSVariableHoistLegacyPass() : FunctionPass(ID) { + initializeTLSVariableHoistLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &Fn) override; + + StringRef getPassName() const override { return "TLS Variable Hoist"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + } + +private: + TLSVariableHoistPass Impl; +}; + +} // end anonymous namespace + +char TLSVariableHoistLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(TLSVariableHoistLegacyPass, "tlshoist", + "TLS Variable Hoist", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(TLSVariableHoistLegacyPass, "tlshoist", + "TLS Variable Hoist", false, false) + +FunctionPass *llvm::createTLSVariableHoistPass() { + return new TLSVariableHoistLegacyPass(); +} + +/// Perform the TLS Variable Hoist optimization for the given function. +bool TLSVariableHoistLegacyPass::runOnFunction(Function &Fn) { + if (skipFunction(Fn)) + return false; + + LLVM_DEBUG(dbgs() << "********** Begin TLS Variable Hoist **********\n"); + LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n'); + + bool MadeChange = + Impl.runImpl(Fn, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + getAnalysis<LoopInfoWrapperPass>().getLoopInfo()); + + if (MadeChange) { + LLVM_DEBUG(dbgs() << "********** Function after TLS Variable Hoist: " + << Fn.getName() << '\n'); + LLVM_DEBUG(dbgs() << Fn); + } + LLVM_DEBUG(dbgs() << "********** End TLS Variable Hoist **********\n"); + + return MadeChange; +} + +void TLSVariableHoistPass::collectTLSCandidate(Instruction *Inst) { + // Skip all cast instructions. They are visited indirectly later on. 
+ if (Inst->isCast()) + return; + + // Scan all operands. + for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { + auto *GV = dyn_cast<GlobalVariable>(Inst->getOperand(Idx)); + if (!GV || !GV->isThreadLocal()) + continue; + + // Add Candidate to TLSCandMap (GV --> Candidate). + TLSCandMap[GV].addUser(Inst, Idx); + } +} + +void TLSVariableHoistPass::collectTLSCandidates(Function &Fn) { + // First, quickly check if there is TLS Variable. + Module *M = Fn.getParent(); + + bool HasTLS = llvm::any_of( + M->globals(), [](GlobalVariable &GV) { return GV.isThreadLocal(); }); + + // If non, directly return. + if (!HasTLS) + return; + + TLSCandMap.clear(); + + // Then, collect TLS Variable info. + for (BasicBlock &BB : Fn) { + // Ignore unreachable basic blocks. + if (!DT->isReachableFromEntry(&BB)) + continue; + + for (Instruction &Inst : BB) + collectTLSCandidate(&Inst); + } +} + +static bool oneUseOutsideLoop(tlshoist::TLSCandidate &Cand, LoopInfo *LI) { + if (Cand.Users.size() != 1) + return false; + + BasicBlock *BB = Cand.Users[0].Inst->getParent(); + if (LI->getLoopFor(BB)) + return false; + + return true; +} + +Instruction *TLSVariableHoistPass::getNearestLoopDomInst(BasicBlock *BB, + Loop *L) { + assert(L && "Unexcepted Loop status!"); + + // Get the outermost loop. + while (Loop *Parent = L->getParentLoop()) + L = Parent; + + BasicBlock *PreHeader = L->getLoopPreheader(); + + // There is unique predecessor outside the loop. 
+ if (PreHeader) + return PreHeader->getTerminator(); + + BasicBlock *Header = L->getHeader(); + BasicBlock *Dom = Header; + for (BasicBlock *PredBB : predecessors(Header)) + Dom = DT->findNearestCommonDominator(Dom, PredBB); + + assert(Dom && "Not find dominator BB!"); + Instruction *Term = Dom->getTerminator(); + + return Term; +} + +Instruction *TLSVariableHoistPass::getDomInst(Instruction *I1, + Instruction *I2) { + if (!I1) + return I2; + if (DT->dominates(I1, I2)) + return I1; + if (DT->dominates(I2, I1)) + return I2; + + // If there is no dominance relation, use common dominator. + BasicBlock *DomBB = + DT->findNearestCommonDominator(I1->getParent(), I2->getParent()); + + Instruction *Dom = DomBB->getTerminator(); + assert(Dom && "Common dominator not found!"); + + return Dom; +} + +BasicBlock::iterator TLSVariableHoistPass::findInsertPos(Function &Fn, + GlobalVariable *GV, + BasicBlock *&PosBB) { + tlshoist::TLSCandidate &Cand = TLSCandMap[GV]; + + // We should hoist the TLS use out of loop, so choose its nearest instruction + // which dominate the loop and the outside loops (if exist). + Instruction *LastPos = nullptr; + for (auto &User : Cand.Users) { + BasicBlock *BB = User.Inst->getParent(); + Instruction *Pos = User.Inst; + if (Loop *L = LI->getLoopFor(BB)) { + Pos = getNearestLoopDomInst(BB, L); + assert(Pos && "Not find insert position out of loop!"); + } + Pos = getDomInst(LastPos, Pos); + LastPos = Pos; + } + + assert(LastPos && "Unexpected insert position!"); + BasicBlock *Parent = LastPos->getParent(); + PosBB = Parent; + return LastPos->getIterator(); +} + +// Generate a bitcast (no type change) to replace the uses of TLS Candidate. 
+Instruction *TLSVariableHoistPass::genBitCastInst(Function &Fn, + GlobalVariable *GV) { + BasicBlock *PosBB = &Fn.getEntryBlock(); + BasicBlock::iterator Iter = findInsertPos(Fn, GV, PosBB); + Type *Ty = GV->getType(); + auto *CastInst = new BitCastInst(GV, Ty, "tls_bitcast"); + PosBB->getInstList().insert(Iter, CastInst); + return CastInst; +} + +bool TLSVariableHoistPass::tryReplaceTLSCandidate(Function &Fn, + GlobalVariable *GV) { + + tlshoist::TLSCandidate &Cand = TLSCandMap[GV]; + + // If only used 1 time and not in loops, we no need to replace it. + if (oneUseOutsideLoop(Cand, LI)) + return false; + + // Generate a bitcast (no type change) + auto *CastInst = genBitCastInst(Fn, GV); + + // to replace the uses of TLS Candidate + for (auto &User : Cand.Users) + User.Inst->setOperand(User.OpndIdx, CastInst); + + return true; +} + +bool TLSVariableHoistPass::tryReplaceTLSCandidates(Function &Fn) { + if (TLSCandMap.empty()) + return false; + + bool Replaced = false; + for (auto &GV2Cand : TLSCandMap) { + GlobalVariable *GV = GV2Cand.first; + Replaced |= tryReplaceTLSCandidate(Fn, GV); + } + + return Replaced; +} + +/// Optimize expensive TLS variables in the given function. +bool TLSVariableHoistPass::runImpl(Function &Fn, DominatorTree &DT, + LoopInfo &LI) { + if (Fn.hasOptNone()) + return false; + + if (!TLSLoadHoist && !Fn.getAttributes().hasFnAttr("tls-load-hoist")) + return false; + + this->LI = &LI; + this->DT = &DT; + assert(this->LI && this->DT && "Unexcepted requirement!"); + + // Collect all TLS variable candidates. 
+ collectTLSCandidates(Fn); + + bool MadeChange = tryReplaceTLSCandidates(Fn); + + return MadeChange; +} + +PreservedAnalyses TLSVariableHoistPass::run(Function &F, + FunctionAnalysisManager &AM) { + + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + + if (!runImpl(F, DT, LI)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 3bcf92e28a21..27c04177e894 100644 --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -53,11 +53,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -76,14 +73,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "tailcallelim" @@ -248,10 +243,10 @@ static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) { isa<PseudoProbeInst>(&I)) continue; - // Special-case operand bundle "clang.arc.attachedcall". + // Special-case operand bundles "clang.arc.attachedcall" and "ptrauth". 
bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundlesOtherThan( - LLVMContext::OB_clang_arc_attachedcall); + {LLVMContext::OB_clang_arc_attachedcall, LLVMContext::OB_ptrauth}); if (!IsNoTail && CI->doesNotAccessMemory()) { // A call to a readnone function whose arguments are all things computed @@ -531,7 +526,7 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) { } // If the function doen't return void, create the RetPN and RetKnownPN PHI - // nodes to track our return value. We initialize RetPN with undef and + // nodes to track our return value. We initialize RetPN with poison and // RetKnownPN with false since we can't know our return value at function // entry. Type *RetType = F.getReturnType(); @@ -540,7 +535,7 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) { RetPN = PHINode::Create(RetType, 2, "ret.tr", InsertPos); RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr", InsertPos); - RetPN->addIncoming(UndefValue::get(RetType), NewEntry); + RetPN->addIncoming(PoisonValue::get(RetType), NewEntry); RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry); } @@ -734,7 +729,7 @@ void TailRecursionEliminator::cleanupAndFinalize() { // call. for (PHINode *PN : ArgumentPHIs) { // If the PHI Node is a dynamic constant, replace it with the value it is. 
- if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) { + if (Value *PNV = simplifyInstruction(PN, F.getParent()->getDataLayout())) { PN->replaceAllUsesWith(PNV); PN->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp index 80a7d3a43ad6..8367e61c1a47 100644 --- a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp +++ b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp @@ -61,7 +61,7 @@ static void warnAboutLeftoverTransformations(Loop *L, << "loop not vectorized: the optimizer was unable to perform the " "requested transformation; the transformation might be disabled " "or specified as part of an unsupported transformation ordering"); - else if (InterleaveCount.getValueOr(0) != 1) + else if (InterleaveCount.value_or(0) != 1) ORE->emit( DiagnosticInfoOptimizationFailure(DEBUG_TYPE, "FailedRequestedInterleaving", |
