diff options
Diffstat (limited to 'lib/Transforms/Scalar/LoopStrengthReduce.cpp')
| -rw-r--r-- | lib/Transforms/Scalar/LoopStrengthReduce.cpp | 349 | 
1 files changed, 238 insertions, 111 deletions
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 625a75d6cca9..cf3d16f05dbb 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -221,7 +221,7 @@ static void DoInitialMatch(const SCEV *S, Loop *L,    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))      if (!AR->getStart()->isZero()) {        DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT); -      DoInitialMatch(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()), +      DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),                                        AR->getStepRecurrence(SE),                                        AR->getLoop()),                       L, Good, Bad, SE, DT); @@ -262,11 +262,15 @@ void Formula::InitialMatch(const SCEV *S, Loop *L,    SmallVector<const SCEV *, 4> Bad;    DoInitialMatch(S, L, Good, Bad, SE, DT);    if (!Good.empty()) { -    BaseRegs.push_back(SE.getAddExpr(Good)); +    const SCEV *Sum = SE.getAddExpr(Good); +    if (!Sum->isZero()) +      BaseRegs.push_back(Sum);      AM.HasBaseReg = true;    }    if (!Bad.empty()) { -    BaseRegs.push_back(SE.getAddExpr(Bad)); +    const SCEV *Sum = SE.getAddExpr(Bad); +    if (!Sum->isZero()) +      BaseRegs.push_back(Sum);      AM.HasBaseReg = true;    }  } @@ -375,7 +379,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,                                  bool IgnoreSignificantBits = false) {    // Handle the trivial case, which works for any SCEV type.    if (LHS == RHS) -    return SE.getIntegerSCEV(1, LHS->getType()); +    return SE.getConstant(LHS->getType(), 1);    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some    // folding. @@ -450,7 +454,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,  static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {    if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {      if (C->getValue()->getValue().getMinSignedBits() <= 64) { -      S = SE.getIntegerSCEV(0, C->getType()); +      S = SE.getConstant(C->getType(), 0);        return C->getValue()->getSExtValue();      }    } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { @@ -473,7 +477,7 @@ static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {  static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {    if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {      if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { -      S = SE.getIntegerSCEV(0, GV->getType()); +      S = SE.getConstant(GV->getType(), 0);        return GV;      }    } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { @@ -781,10 +785,10 @@ struct LSRFixup {    /// will be replaced.    Value *OperandValToReplace; -  /// PostIncLoop - If this user is to use the post-incremented value of an +  /// PostIncLoops - If this user is to use the post-incremented value of an    /// induction variable, this variable is non-null and holds the loop    /// associated with the induction variable. -  const Loop *PostIncLoop; +  PostIncLoopSet PostIncLoops;    /// LUIdx - The index of the LSRUse describing the expression which    /// this fixup needs, minus an offset (below). @@ -795,6 +799,8 @@ struct LSRFixup {    /// offsets, for example in an unrolled loop.    int64_t Offset; +  bool isUseFullyOutsideLoop(const Loop *L) const; +    LSRFixup();    void print(raw_ostream &OS) const; @@ -804,9 +810,24 @@ struct LSRFixup {  }  LSRFixup::LSRFixup() -  : UserInst(0), OperandValToReplace(0), PostIncLoop(0), +  : UserInst(0), OperandValToReplace(0),      LUIdx(~size_t(0)), Offset(0) {} +/// isUseFullyOutsideLoop - Test whether this fixup always uses its +/// value outside of the given loop. +bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { +  // PHI nodes use their value in their incoming blocks. +  if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { +    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) +      if (PN->getIncomingValue(i) == OperandValToReplace && +          L->contains(PN->getIncomingBlock(i))) +        return false; +    return true; +  } + +  return !L->contains(UserInst); +} +  void LSRFixup::print(raw_ostream &OS) const {    OS << "UserInst=";    // Store is common and interesting enough to be worth special-casing. @@ -821,9 +842,10 @@ void LSRFixup::print(raw_ostream &OS) const {    OS << ", OperandValToReplace=";    WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false); -  if (PostIncLoop) { +  for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(), +       E = PostIncLoops.end(); I != E; ++I) {      OS << ", PostIncLoop="; -    WriteAsOperand(OS, PostIncLoop->getHeader(), /*PrintType=*/false); +    WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false);    }    if (LUIdx != ~size_t(0)) @@ -1135,6 +1157,7 @@ class LSRInstance {    IVUsers &IU;    ScalarEvolution &SE;    DominatorTree &DT; +  LoopInfo &LI;    const TargetLowering *const TLI;    Loop *const L;    bool Changed; @@ -1214,6 +1237,13 @@ public:                      DenseSet<const SCEV *> &VisitedRegs) const;    void Solve(SmallVectorImpl<const Formula *> &Solution) const; +  BasicBlock::iterator +    HoistInsertPosition(BasicBlock::iterator IP, +                        const SmallVectorImpl<Instruction *> &Inputs) const; +  BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP, +                                                     const LSRFixup &LF, +                                                     const LSRUse &LU) const; +    Value *Expand(const LSRFixup &LF,                  const Formula &F,                  BasicBlock::iterator IP, @@ -1427,16 +1457,30 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {    const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);    if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))      return Cond; -  const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType()); +  const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);    // Add one to the backedge-taken count to get the trip count.    const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One); - -  // Check for a max calculation that matches the pattern. -  if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount)) +  if (IterationCount != SE.getSCEV(Sel)) return Cond; + +  // Check for a max calculation that matches the pattern. There's no check +  // for ICMP_ULE here because the comparison would be with zero, which +  // isn't interesting. +  CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; +  const SCEVNAryExpr *Max = 0; +  if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) { +    Pred = ICmpInst::ICMP_SLE; +    Max = S; +  } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) { +    Pred = ICmpInst::ICMP_SLT; +    Max = S; +  } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) { +    Pred = ICmpInst::ICMP_ULT; +    Max = U; +  } else { +    // No match; bail.      return Cond; -  const SCEVNAryExpr *Max = cast<SCEVNAryExpr>(IterationCount); -  if (Max != SE.getSCEV(Sel)) return Cond; +  }    // To handle a max with more than two operands, this optimization would    // require additional checking and setup. @@ -1445,7 +1489,13 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {    const SCEV *MaxLHS = Max->getOperand(0);    const SCEV *MaxRHS = Max->getOperand(1); -  if (!MaxLHS || MaxLHS != One) return Cond; + +  // ScalarEvolution canonicalizes constants to the left. For < and >, look +  // for a comparison with 1. For <= and >=, a comparison with zero. +  if (!MaxLHS || +      (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One))) +    return Cond; +    // Check the relevant induction variable for conformance to    // the pattern.    const SCEV *IV = SE.getSCEV(Cond->getOperand(0)); @@ -1461,16 +1511,29 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {    // Check the right operand of the select, and remember it, as it will    // be used in the new comparison instruction.    Value *NewRHS = 0; -  if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) +  if (ICmpInst::isTrueWhenEqual(Pred)) { +    // Look for n+1, and grab n. +    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1))) +      if (isa<ConstantInt>(BO->getOperand(1)) && +          cast<ConstantInt>(BO->getOperand(1))->isOne() && +          SE.getSCEV(BO->getOperand(0)) == MaxRHS) +        NewRHS = BO->getOperand(0); +    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2))) +      if (isa<ConstantInt>(BO->getOperand(1)) && +          cast<ConstantInt>(BO->getOperand(1))->isOne() && +          SE.getSCEV(BO->getOperand(0)) == MaxRHS) +        NewRHS = BO->getOperand(0); +    if (!NewRHS) +      return Cond; +  } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)      NewRHS = Sel->getOperand(1);    else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)      NewRHS = Sel->getOperand(2); -  if (!NewRHS) return Cond; +  else +    llvm_unreachable("Max doesn't match expected pattern!");    // Determine the new comparison opcode. It may be signed or unsigned,    // and the original comparison may be either equality or inequality. -  CmpInst::Predicate Pred = -    isa<SCEVSMaxExpr>(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;    if (Cond->getPredicate() == CmpInst::ICMP_EQ)      Pred = CmpInst::getInversePredicate(Pred); @@ -1545,8 +1608,9 @@ LSRInstance::OptimizeLoopTermCond() {              !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {            // Conservatively assume there may be reuse if the quotient of their            // strides could be a legal scale. -          const SCEV *A = CondUse->getStride(); -          const SCEV *B = UI->getStride(); +          const SCEV *A = IU.getStride(*CondUse, L); +          const SCEV *B = IU.getStride(*UI, L); +          if (!A || !B) continue;            if (SE.getTypeSizeInBits(A->getType()) !=                SE.getTypeSizeInBits(B->getType())) {              if (SE.getTypeSizeInBits(A->getType()) > @@ -1598,8 +1662,7 @@ LSRInstance::OptimizeLoopTermCond() {          ExitingBlock->getInstList().insert(TermBr, Cond);          // Clone the IVUse, as the old use still exists! -        CondUse = &IU.AddUser(CondUse->getStride(), CondUse->getOffset(), -                              Cond, CondUse->getOperandValToReplace()); +        CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());          TermBr->replaceUsesOfWith(OldCond, Cond);        }      } @@ -1607,9 +1670,7 @@ LSRInstance::OptimizeLoopTermCond() {      // If we get to here, we know that we can transform the setcc instruction to      // use the post-incremented version of the IV, allowing us to coalesce the      // live ranges for the IV correctly. -    CondUse->setOffset(SE.getMinusSCEV(CondUse->getOffset(), -                                       CondUse->getStride())); -    CondUse->setIsUseOfPostIncrementedValue(true); +    CondUse->transformToPostInc(L);      Changed = true;      PostIncs.insert(Cond); @@ -1717,19 +1778,24 @@ void LSRInstance::CollectInterestingTypesAndFactors() {    SmallSetVector<const SCEV *, 4> Strides;    // Collect interesting types and strides. +  SmallVector<const SCEV *, 4> Worklist;    for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { -    const SCEV *Stride = UI->getStride(); +    const SCEV *Expr = IU.getExpr(*UI);      // Collect interesting types. -    Types.insert(SE.getEffectiveSCEVType(Stride->getType())); - -    // Add the stride for this loop. -    Strides.insert(Stride); - -    // Add strides for other mentioned loops. -    for (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(UI->getOffset()); -         AR; AR = dyn_cast<SCEVAddRecExpr>(AR->getStart())) -      Strides.insert(AR->getStepRecurrence(SE)); +    Types.insert(SE.getEffectiveSCEVType(Expr->getType())); + +    // Add strides for mentioned loops. +    Worklist.push_back(Expr); +    do { +      const SCEV *S = Worklist.pop_back_val(); +      if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { +        Strides.insert(AR->getStepRecurrence(SE)); +        Worklist.push_back(AR->getStart()); +      } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { +        Worklist.insert(Worklist.end(), Add->op_begin(), Add->op_end()); +      } +    } while (!Worklist.empty());    }    // Compute interesting factors from the set of interesting strides. @@ -1776,8 +1842,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {      LSRFixup &LF = getNewFixup();      LF.UserInst = UI->getUser();      LF.OperandValToReplace = UI->getOperandValToReplace(); -    if (UI->isUseOfPostIncrementedValue()) -      LF.PostIncLoop = L; +    LF.PostIncLoops = UI->getPostIncLoops();      LSRUse::KindType Kind = LSRUse::Basic;      const Type *AccessTy = 0; @@ -1786,7 +1851,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {        AccessTy = getAccessType(LF.UserInst);      } -    const SCEV *S = IU.getCanonicalExpr(*UI); +    const SCEV *S = IU.getExpr(*UI);      // Equality (== and !=) ICmps are special. We can rewrite (i == N) as      // (N - i == 0), and this allows (N - i) to be the expression that we work @@ -1824,7 +1889,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {      LF.LUIdx = P.first;      LF.Offset = P.second;      LSRUse &LU = Uses[LF.LUIdx]; -    LU.AllFixupsOutsideLoop &= !L->contains(LF.UserInst); +    LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);      // If this is the first use of this LSRUse, give it a formula.      if (LU.Formulae.empty()) { @@ -1918,9 +1983,17 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {            continue;          // Ignore uses which are part of other SCEV expressions, to avoid          // analyzing them multiple times. -        if (SE.isSCEVable(UserInst->getType()) && -            !isa<SCEVUnknown>(SE.getSCEV(const_cast<Instruction *>(UserInst)))) -          continue; +        if (SE.isSCEVable(UserInst->getType())) { +          const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst)); +          // If the user is a no-op, look through to its uses. +          if (!isa<SCEVUnknown>(UserS)) +            continue; +          if (UserS == U) { +            Worklist.push_back( +              SE.getUnknown(const_cast<Instruction *>(UserInst))); +            continue; +          } +        }          // Ignore icmp instructions which are already being analyzed.          if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {            unsigned OtherIdx = !UI.getOperandNo(); @@ -1936,7 +2009,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {          LF.LUIdx = P.first;          LF.Offset = P.second;          LSRUse &LU = Uses[LF.LUIdx]; -        LU.AllFixupsOutsideLoop &= L->contains(LF.UserInst); +        LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);          InsertSupplementalFormula(U, LU, LF.LUIdx);          CountRegisters(LU.Formulae.back(), Uses.size() - 1);          break; @@ -1959,7 +2032,7 @@ static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,    } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {      // Split a non-zero base out of an addrec.      if (!AR->getStart()->isZero()) { -      CollectSubexprs(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()), +      CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),                                         AR->getStepRecurrence(SE),                                         AR->getLoop()), C, Ops, SE);        CollectSubexprs(AR->getStart(), C, Ops, SE); @@ -2020,8 +2093,11 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,                             LU.Kind, LU.AccessTy, TLI, SE))          continue; +      const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); +      if (InnerSum->isZero()) +        continue;        Formula F = Base; -      F.BaseRegs[i] = SE.getAddExpr(InnerAddOps); +      F.BaseRegs[i] = InnerSum;        F.BaseRegs.push_back(*J);        if (InsertFormula(LU, LUIdx, F))          // If that formula hadn't been seen before, recurse to find more like @@ -2102,7 +2178,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,        F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;        if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,                       LU.Kind, LU.AccessTy, TLI)) { -        F.BaseRegs[i] = SE.getAddExpr(G, SE.getIntegerSCEV(*I, G->getType())); +        F.BaseRegs[i] = SE.getAddExpr(G, SE.getConstant(G->getType(), *I));          (void)InsertFormula(LU, LUIdx, F);        } @@ -2165,7 +2241,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,      // Compensate for the use having MinOffset built into it.      F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset; -    const SCEV *FactorS = SE.getIntegerSCEV(Factor, IntTy); +    const SCEV *FactorS = SE.getConstant(IntTy, Factor);      // Check that multiplying with each base register doesn't overflow.      for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) { @@ -2227,7 +2303,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx,      for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)        if (const SCEVAddRecExpr *AR =              dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) { -        const SCEV *FactorS = SE.getIntegerSCEV(Factor, IntTy); +        const SCEV *FactorS = SE.getConstant(IntTy, Factor);          if (FactorS->isZero())            continue;          // Divide out the factor, ignoring high bits, since we'll be @@ -2426,7 +2502,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {            if (C->getValue()->getValue().isNegative() !=                  (NewF.AM.BaseOffs < 0) &&                (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale)) -                .ule(APInt(BitWidth, NewF.AM.BaseOffs).abs())) +                .ule(abs64(NewF.AM.BaseOffs)))              continue;          // OK, looks good. @@ -2454,7 +2530,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {                if (C->getValue()->getValue().isNegative() !=                      (NewF.AM.BaseOffs < 0) &&                    C->getValue()->getValue().abs() -                    .ule(APInt(BitWidth, NewF.AM.BaseOffs).abs())) +                    .ule(abs64(NewF.AM.BaseOffs)))                  goto skip_formula;            // Ok, looks good. @@ -2776,37 +2852,33 @@ static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) {    return Node->getBlock();  } -Value *LSRInstance::Expand(const LSRFixup &LF, -                           const Formula &F, -                           BasicBlock::iterator IP, -                           SCEVExpander &Rewriter, -                           SmallVectorImpl<WeakVH> &DeadInsts) const { -  const LSRUse &LU = Uses[LF.LUIdx]; - -  // Then, collect some instructions which we will remain dominated by when -  // expanding the replacement. These must be dominated by any operands that -  // will be required in the expansion. -  SmallVector<Instruction *, 4> Inputs; -  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace)) -    Inputs.push_back(I); -  if (LU.Kind == LSRUse::ICmpZero) -    if (Instruction *I = -          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1))) -      Inputs.push_back(I); -  if (LF.PostIncLoop) { -    if (!L->contains(LF.UserInst)) -      Inputs.push_back(L->getLoopLatch()->getTerminator()); -    else -      Inputs.push_back(IVIncInsertPos); -  } - -  // Then, climb up the immediate dominator tree as far as we can go while -  // still being dominated by the input positions. +/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up +/// the dominator tree far as we can go while still being dominated by the +/// input positions. This helps canonicalize the insert position, which +/// encourages sharing. +BasicBlock::iterator +LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, +                                 const SmallVectorImpl<Instruction *> &Inputs) +                                                                         const {    for (;;) { +    const Loop *IPLoop = LI.getLoopFor(IP->getParent()); +    unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0; + +    BasicBlock *IDom; +    for (BasicBlock *Rung = IP->getParent(); ; Rung = IDom) { +      IDom = getImmediateDominator(Rung, DT); +      if (!IDom) return IP; + +      // Don't climb into a loop though. +      const Loop *IDomLoop = LI.getLoopFor(IDom); +      unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0; +      if (IDomDepth <= IPLoopDepth && +          (IDomDepth != IPLoopDepth || IDomLoop == IPLoop)) +        break; +    } +      bool AllDominate = true;      Instruction *BetterPos = 0; -    BasicBlock *IDom = getImmediateDominator(IP->getParent(), DT); -    if (!IDom) break;      Instruction *Tentative = IDom->getTerminator();      for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),           E = Inputs.end(); I != E; ++I) { @@ -2815,6 +2887,8 @@ Value *LSRInstance::Expand(const LSRFixup &LF,          AllDominate = false;          break;        } +      // Attempt to find an insert position in the middle of the block, +      // instead of at the end, so that it can be used for other expansions.        if (IDom == Inst->getParent() &&            (!BetterPos || DT.dominates(BetterPos, Inst)))          BetterPos = next(BasicBlock::iterator(Inst)); @@ -2826,12 +2900,77 @@ Value *LSRInstance::Expand(const LSRFixup &LF,      else        IP = Tentative;    } + +  return IP; +} + +/// AdjustInsertPositionForExpand - Determine an input position which will be +/// dominated by the operands and which will dominate the result. +BasicBlock::iterator +LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP, +                                           const LSRFixup &LF, +                                           const LSRUse &LU) const { +  // Collect some instructions which must be dominated by the +  // expanding replacement. These must be dominated by any operands that +  // will be required in the expansion. +  SmallVector<Instruction *, 4> Inputs; +  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace)) +    Inputs.push_back(I); +  if (LU.Kind == LSRUse::ICmpZero) +    if (Instruction *I = +          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1))) +      Inputs.push_back(I); +  if (LF.PostIncLoops.count(L)) { +    if (LF.isUseFullyOutsideLoop(L)) +      Inputs.push_back(L->getLoopLatch()->getTerminator()); +    else +      Inputs.push_back(IVIncInsertPos); +  } +  // The expansion must also be dominated by the increment positions of any +  // loops it for which it is using post-inc mode. +  for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(), +       E = LF.PostIncLoops.end(); I != E; ++I) { +    const Loop *PIL = *I; +    if (PIL == L) continue; + +    // Be dominated by the loop exit. +    SmallVector<BasicBlock *, 4> ExitingBlocks; +    PIL->getExitingBlocks(ExitingBlocks); +    if (!ExitingBlocks.empty()) { +      BasicBlock *BB = ExitingBlocks[0]; +      for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i) +        BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]); +      Inputs.push_back(BB->getTerminator()); +    } +  } + +  // Then, climb up the immediate dominator tree as far as we can go while +  // still being dominated by the input positions. +  IP = HoistInsertPosition(IP, Inputs); + +  // Don't insert instructions before PHI nodes.    while (isa<PHINode>(IP)) ++IP; + +  // Ignore debug intrinsics.    while (isa<DbgInfoIntrinsic>(IP)) ++IP; +  return IP; +} + +Value *LSRInstance::Expand(const LSRFixup &LF, +                           const Formula &F, +                           BasicBlock::iterator IP, +                           SCEVExpander &Rewriter, +                           SmallVectorImpl<WeakVH> &DeadInsts) const { +  const LSRUse &LU = Uses[LF.LUIdx]; + +  // Determine an input position which will be dominated by the operands and +  // which will dominate the result. +  IP = AdjustInsertPositionForExpand(IP, LF, LU); +    // Inform the Rewriter if we have a post-increment use, so that it can    // perform an advantageous expansion. -  Rewriter.setPostInc(LF.PostIncLoop); +  Rewriter.setPostInc(LF.PostIncLoops);    // This is the type that the user actually needs.    const Type *OpTy = LF.OperandValToReplace->getType(); @@ -2855,24 +2994,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF,      const SCEV *Reg = *I;      assert(!Reg->isZero() && "Zero allocated in a base register!"); -    // If we're expanding for a post-inc user for the add-rec's loop, make the -    // post-inc adjustment. -    const SCEV *Start = Reg; -    while (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Start)) { -      if (AR->getLoop() == LF.PostIncLoop) { -        Reg = SE.getAddExpr(Reg, AR->getStepRecurrence(SE)); -        // If the user is inside the loop, insert the code after the increment -        // so that it is dominated by its operand. If the original insert point -        // was already dominated by the increment, keep it, because there may -        // be loop-variant operands that need to be respected also. -        if (L->contains(LF.UserInst) && !DT.dominates(IVIncInsertPos, IP)) { -          IP = IVIncInsertPos; -          while (isa<DbgInfoIntrinsic>(IP)) ++IP; -        } -        break; -      } -      Start = AR->getStart(); -    } +    // If we're expanding for a post-inc user, make the post-inc adjustment. +    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); +    Reg = TransformForPostIncUse(Denormalize, Reg, +                                 LF.UserInst, LF.OperandValToReplace, +                                 Loops, SE, DT);      Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));    } @@ -2889,11 +3015,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF,    if (F.AM.Scale != 0) {      const SCEV *ScaledS = F.ScaledReg; -    // If we're expanding for a post-inc user for the add-rec's loop, make the -    // post-inc adjustment. -    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ScaledS)) -      if (AR->getLoop() == LF.PostIncLoop) -        ScaledS = SE.getAddExpr(ScaledS, AR->getStepRecurrence(SE)); +    // If we're expanding for a post-inc user, make the post-inc adjustment. +    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); +    ScaledS = TransformForPostIncUse(Denormalize, ScaledS, +                                     LF.UserInst, LF.OperandValToReplace, +                                     Loops, SE, DT);      if (LU.Kind == LSRUse::ICmpZero) {        // An interesting way of "folding" with an icmp is to use a negated @@ -2907,8 +3033,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,        // which is expected to be matched as part of the address.        ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));        ScaledS = SE.getMulExpr(ScaledS, -                              SE.getIntegerSCEV(F.AM.Scale, -                                                ScaledS->getType())); +                              SE.getConstant(ScaledS->getType(), F.AM.Scale));        Ops.push_back(ScaledS);        // Flush the operand list to suppress SCEVExpander hoisting. @@ -2949,12 +3074,12 @@ Value *LSRInstance::Expand(const LSRFixup &LF,    // Emit instructions summing all the operands.    const SCEV *FullS = Ops.empty() ? -                      SE.getIntegerSCEV(0, IntTy) : +                      SE.getConstant(IntTy, 0) :                        SE.getAddExpr(Ops);    Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);    // We're done expanding now, so reset the rewriter. -  Rewriter.setPostInc(0); +  Rewriter.clearPostInc();    // An ICmpZero Formula represents an ICmp which we're handling as a    // comparison against zero. Now that we've expanded an expression for that @@ -3118,6 +3243,7 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)    : IU(P->getAnalysis<IVUsers>()),      SE(P->getAnalysis<ScalarEvolution>()),      DT(P->getAnalysis<DominatorTree>()), +    LI(P->getAnalysis<LoopInfo>()),      TLI(tli), L(l), Changed(false), IVIncInsertPos(0) {    // If LoopSimplify form is not available, stay out of trouble. @@ -3274,9 +3400,10 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {    // We split critical edges, so we change the CFG.  However, we do update    // many analyses if they are around.    AU.addPreservedID(LoopSimplifyID); -  AU.addPreserved<LoopInfo>();    AU.addPreserved("domfrontier"); +  AU.addRequired<LoopInfo>(); +  AU.addPreserved<LoopInfo>();    AU.addRequiredID(LoopSimplifyID);    AU.addRequired<DominatorTree>();    AU.addPreserved<DominatorTree>();  | 
