Diffstat (limited to 'lib/Transforms')
35 files changed, 1167 insertions, 740 deletions
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 4bc64ab698ff..087a8aa2c624 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -72,10 +72,6 @@ static cl::opt<bool>  RunLoopRerolling("reroll-loops", cl::Hidden,                   cl::desc("Run the loop rerolling pass")); -static cl::opt<bool> RunLoadCombine("combine-loads", cl::init(false), -                                    cl::Hidden, -                                    cl::desc("Run the load combining pass")); -  static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden,                                 cl::desc("Run the NewGVN pass")); @@ -174,7 +170,6 @@ PassManagerBuilder::PassManagerBuilder() {      SLPVectorize = RunSLPVectorization;      LoopVectorize = RunLoopVectorization;      RerollLoops = RunLoopRerolling; -    LoadCombine = RunLoadCombine;      NewGVN = RunNewGVN;      DisableGVNLoadPRE = false;      VerifyInput = false; @@ -296,6 +291,8 @@ void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) {      InstrProfOptions Options;      if (!PGOInstrGen.empty())        Options.InstrProfileOutput = PGOInstrGen; +    Options.DoCounterPromotion = true; +    MPM.add(createLoopRotatePass());      MPM.add(createInstrProfilingLegacyPass(Options));    }    if (!PGOInstrUse.empty()) @@ -407,9 +404,6 @@ void PassManagerBuilder::addFunctionSimplificationPasses(      }    } -  if (LoadCombine) -    MPM.add(createLoadCombinePass()); -    MPM.add(createAggressiveDCEPass());         // Delete dead instructions    MPM.add(createCFGSimplificationPass()); // Merge & remove BBs    // Clean up after everything. @@ -850,9 +844,6 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {    // alignments.    PM.add(createAlignmentFromAssumptionsPass()); -  if (LoadCombine) -    PM.add(createLoadCombinePass()); -    // Cleanup and simplify the code after the scalar optimizations.    addInstructionCombiningPass(PM);    addExtensionsToPM(EP_Peephole, PM); diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index 67bc8f5f6b7a..656421ee58df 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -690,6 +690,9 @@ bool SampleProfileLoader::inlineHotFunctions(      for (auto I : CIS) {        InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr);        Function *CalledFunction = CallSite(I).getCalledFunction(); +      // Do not inline recursive calls. 
+      if (CalledFunction == &F) +        continue;        Instruction *DI = I;        if (!CalledFunction && !PromotedInsns.count(I) &&            CallSite(I).isIndirectCall()) diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 287a5167fe2a..d5f0dd191415 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -988,15 +988,24 @@ static Instruction *foldAddWithConstant(BinaryOperator &Add,      return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);    } -  // Shifts and add used to flip and mask off the low bit: -  // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1 -  const APInt *C3; -  if (C->isOneValue() && -      match(Op0, -            m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3)))) && -      C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) { -    Value *NotX = Builder.CreateNot(X); -    return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1)); +  if (C->isOneValue() && Op0->hasOneUse()) { +    // add (sext i1 X), 1 --> zext (not X) +    // TODO: The smallest IR representation is (select X, 0, 1), and that would +    // not require the one-use check. But we need to remove a transform in +    // visitSelect and make sure that IR value tracking for select is equal or +    // better than for these ops. +    if (match(Op0, m_SExt(m_Value(X))) && +        X->getType()->getScalarSizeInBits() == 1) +      return new ZExtInst(Builder.CreateNot(X), Ty); + +    // Shifts and add used to flip and mask off the low bit: +    // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1 +    const APInt *C3; +    if (match(Op0, m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3))) && +        C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) { +      Value *NotX = Builder.CreateNot(X); +      return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1)); +    }    }    return nullptr; diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index a881bda5ba98..d3d8cefe9735 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1097,20 +1097,11 @@ static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast,    Type *DestTy = Logic.getType();    Type *SrcTy = Cast->getSrcTy(); -  // If the first operand is bitcast, move the logic operation ahead of the -  // bitcast (do the logic operation in the original type). This can eliminate -  // bitcasts and allow combines that would otherwise be impeded by the bitcast. +  // Move the logic operation ahead of a zext if the constant is unchanged in +  // the smaller source type. Performing the logic in a smaller type may provide +  // more information to later folds, and the smaller logic instruction may be +  // cheaper (particularly in the case of vectors).    Value *X; -  if (match(Cast, m_BitCast(m_Value(X)))) { -    Value *NewConstant = ConstantExpr::getBitCast(C, SrcTy); -    Value *NewOp = Builder->CreateBinOp(LogicOpc, X, NewConstant); -    return CastInst::CreateBitOrPointerCast(NewOp, DestTy); -  } - -  // Similarly, move the logic operation ahead of a zext if the constant is -  // unchanged in the smaller source type. Performing the logic in a smaller -  // type may provide more information to later folds, and the smaller logic -  // instruction may be cheaper (particularly in the case of vectors).    
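A minimal standalone sketch in plain C++ (illustrative values only, not the InstCombine code itself) of why the zext rule described above is sound when the constant is unchanged in the smaller source type, i.e. zext(trunc(C)) == C:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 0x5Au;                        // zext(trunc(C)) == C for an i8 source
  const uint8_t TruncC = static_cast<uint8_t>(C);  // trunc(C)
  for (unsigned v = 0; v < 256; ++v) {
    const uint8_t X = static_cast<uint8_t>(v);
    // Doing the logic in the narrow type and zero-extending afterwards is the
    // same as zero-extending first, for and/or/xor alike.
    assert(static_cast<uint32_t>(static_cast<uint8_t>(X & TruncC)) == (static_cast<uint32_t>(X) & C));
    assert(static_cast<uint32_t>(static_cast<uint8_t>(X | TruncC)) == (static_cast<uint32_t>(X) | C));
    assert(static_cast<uint32_t>(static_cast<uint8_t>(X ^ TruncC)) == (static_cast<uint32_t>(X) ^ C));
  }
  return 0;
}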
if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) {      Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy);      Constant *ZextTruncC = ConstantExpr::getZExt(TruncC, DestTy); @@ -1239,9 +1230,10 @@ static Instruction *foldAndToXor(BinaryOperator &I,    // (A | ~B) & (B | ~A) --> ~(A ^ B)    // (~B | A) & (~A | B) --> ~(A ^ B)    // (~B | A) & (B | ~A) --> ~(A ^ B) -  if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) && -      match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) -    return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); +  if (Op0->hasOneUse() || Op1->hasOneUse()) +    if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) && +        match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) +      return BinaryOperator::CreateNot(Builder.CreateXor(A, B));    return nullptr;  } @@ -1256,9 +1248,10 @@ static Instruction *foldOrToXor(BinaryOperator &I,    // Operand complexity canonicalization guarantees that the 'and' is Op0.    // (A & B) | ~(A | B) --> ~(A ^ B)    // (A & B) | ~(B | A) --> ~(A ^ B) -  if (match(Op0, m_And(m_Value(A), m_Value(B))) && -      match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) -    return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); +  if (Op0->hasOneUse() || Op1->hasOneUse()) +    if (match(Op0, m_And(m_Value(A), m_Value(B))) && +        match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) +      return BinaryOperator::CreateNot(Builder.CreateXor(A, B));    // (A & ~B) | (~A & B) --> A ^ B    // (A & ~B) | (B & ~A) --> A ^ B @@ -1442,13 +1435,13 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {      // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C      if (match(Op0, m_Xor(m_Value(A), m_Value(B))))        if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) -        if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse()) +        if (Op1->hasOneUse() || IsFreeToInvert(C, C->hasOneUse()))            return BinaryOperator::CreateAnd(Op0, Builder->CreateNot(C));      // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C      if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))        if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) -        if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse()) +        if (Op0->hasOneUse() || IsFreeToInvert(C, C->hasOneUse()))            return BinaryOperator::CreateAnd(Op1, Builder->CreateNot(C));      // (A | B) & ((~A) ^ B) -> (A & B) @@ -1579,11 +1572,14 @@ static Value *getSelectCondition(Value *A, Value *B,    // If A and B are sign-extended, look through the sexts to find the booleans.    Value *Cond; +  Value *NotB;    if (match(A, m_SExt(m_Value(Cond))) &&        Cond->getType()->getScalarType()->isIntegerTy(1) && -      match(B, m_CombineOr(m_Not(m_SExt(m_Specific(Cond))), -                           m_SExt(m_Not(m_Specific(Cond)))))) -    return Cond; +      match(B, m_OneUse(m_Not(m_Value(NotB))))) { +    NotB = peekThroughBitcast(NotB, true); +    if (match(NotB, m_SExt(m_Specific(Cond)))) +      return Cond; +  }    // All scalar (and most vector) possibilities should be handled now.    // Try more matches that only apply to non-splat constant vectors. @@ -1615,12 +1611,8 @@ static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,    // The potential condition of the select may be bitcasted. In that case, look    // through its bitcast and the corresponding bitcast of the 'not' condition.    
Type *OrigType = A->getType(); -  Value *SrcA, *SrcB; -  if (match(A, m_OneUse(m_BitCast(m_Value(SrcA)))) && -      match(B, m_OneUse(m_BitCast(m_Value(SrcB))))) { -    A = SrcA; -    B = SrcB; -  } +  A = peekThroughBitcast(A, true); +  B = peekThroughBitcast(B, true);    if (Value *Cond = getSelectCondition(A, B, Builder)) {      // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D)) @@ -1922,8 +1914,9 @@ Value *InstCombiner::foldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {  ///     (A & C1) | B  ///  /// when the XOR of the two constants is "all ones" (-1). -Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op, -                                               Value *A, Value *B, Value *C) { +static Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, +                                        Value *A, Value *B, Value *C, +                                        InstCombiner::BuilderTy *Builder) {    ConstantInt *CI1 = dyn_cast<ConstantInt>(C);    if (!CI1) return nullptr; @@ -1944,15 +1937,16 @@ Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op,  /// \brief This helper function folds:  /// -///     ((A | B) & C1) ^ (B & C2) +///     ((A ^ B) & C1) | (B & C2)  ///  /// into:  ///  ///     (A & C1) ^ B  ///  /// when the XOR of the two constants is "all ones" (-1). -Instruction *InstCombiner::FoldXorWithConstants(BinaryOperator &I, Value *Op, -                                                Value *A, Value *B, Value *C) { +static Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, +                                         Value *A, Value *B, Value *C, +                                         InstCombiner::BuilderTy *Builder) {    ConstantInt *CI1 = dyn_cast<ConstantInt>(C);    if (!CI1)      return nullptr; @@ -2112,46 +2106,36 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {      }      // ((A|B)&1)|(B&-2) -> (A&1) | B -    if (match(A, m_Or(m_Value(V1), m_Specific(B))) || -        match(A, m_Or(m_Specific(B), m_Value(V1)))) { -      Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C); -      if (Ret) return Ret; +    if (match(A, m_c_Or(m_Value(V1), m_Specific(B)))) { +      if (Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C, Builder)) +        return Ret;      }      // (B&-2)|((A|B)&1) -> (A&1) | B -    if (match(B, m_Or(m_Specific(A), m_Value(V1))) || -        match(B, m_Or(m_Value(V1), m_Specific(A)))) { -      Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D); -      if (Ret) return Ret; +    if (match(B, m_c_Or(m_Specific(A), m_Value(V1)))) { +      if (Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D, Builder)) +        return Ret;      }      // ((A^B)&1)|(B&-2) -> (A&1) ^ B -    if (match(A, m_Xor(m_Value(V1), m_Specific(B))) || -        match(A, m_Xor(m_Specific(B), m_Value(V1)))) { -      Instruction *Ret = FoldXorWithConstants(I, Op1, V1, B, C); -      if (Ret) return Ret; +    if (match(A, m_c_Xor(m_Value(V1), m_Specific(B)))) { +      if (Instruction *Ret = FoldXorWithConstants(I, Op1, V1, B, C, Builder)) +        return Ret;      }      // (B&-2)|((A^B)&1) -> (A&1) ^ B -    if (match(B, m_Xor(m_Specific(A), m_Value(V1))) || -        match(B, m_Xor(m_Value(V1), m_Specific(A)))) { -      Instruction *Ret = FoldXorWithConstants(I, Op0, A, V1, D); -      if (Ret) return Ret; +    if (match(B, m_c_Xor(m_Specific(A), m_Value(V1)))) { +      if (Instruction *Ret = FoldXorWithConstants(I, Op0, A, V1, D, Builder)) +        return Ret;      }    }    // (A ^ 
B) | ((B ^ C) ^ A) -> (A ^ B) | C -  // FIXME: The two hasOneUse calls here are the same call, maybe we were -  // supposed to check Op1->operand(0)?    if (match(Op0, m_Xor(m_Value(A), m_Value(B))))      if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) -      if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse()) -        return BinaryOperator::CreateOr(Op0, C); +      return BinaryOperator::CreateOr(Op0, C);    // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C -  // FIXME: The two hasOneUse calls here are the same call, maybe we were -  // supposed to check Op0->operand(0)?    if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))      if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) -      if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse()) -        return BinaryOperator::CreateOr(Op1, C); +      return BinaryOperator::CreateOr(Op1, C);    // ((B | C) & A) | B -> B | (A & C)    if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A)))) @@ -2357,6 +2341,30 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) {      }    } +  // Instead of trying to imitate the folds for and/or, decompose this 'xor' +  // into those logic ops. That is, try to turn this into an and-of-icmps +  // because we have many folds for that pattern. +  // +  // This is based on a truth table definition of xor: +  // X ^ Y --> (X | Y) & !(X & Y) +  if (Value *OrICmp = SimplifyBinOp(Instruction::Or, LHS, RHS, SQ)) { +    // TODO: If OrICmp is true, then the definition of xor simplifies to !(X&Y). +    // TODO: If OrICmp is false, the whole thing is false (InstSimplify?). +    if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) { +      // TODO: Independently handle cases where the 'and' side is a constant. +      if (OrICmp == LHS && AndICmp == RHS && RHS->hasOneUse()) { +        // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS +        RHS->setPredicate(RHS->getInversePredicate()); +        return Builder->CreateAnd(LHS, RHS); +      } +      if (OrICmp == RHS && AndICmp == LHS && LHS->hasOneUse()) { +        // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS +        LHS->setPredicate(LHS->getInversePredicate()); +        return Builder->CreateAnd(LHS, RHS); +      } +    } +  } +    return nullptr;  } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index c0830a5d2112..dbed7ad4eae8 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1409,6 +1409,47 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {      }    } +  // Add range metadata since known bits can't completely reflect what we know. +  // TODO: Handle splat vectors. +  auto *IT = dyn_cast<IntegerType>(Op0->getType()); +  if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) { +    Metadata *LowAndHigh[] = { +        ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)), +        ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))}; +    II.setMetadata(LLVMContext::MD_range, +                   MDNode::get(II.getContext(), LowAndHigh)); +    return &II; +  } + +  return nullptr; +} + +static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) { +  assert(II.getIntrinsicID() == Intrinsic::ctpop && +         "Expected ctpop intrinsic"); +  Value *Op0 = II.getArgOperand(0); +  // FIXME: Try to simplify vectors of integers. 
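As a hedged illustration of the range computed below (plain C++ with an invented KnownBits32 struct, not LLVM's KnownBits): bits known to be one bound the population count from below, and bits known to be zero bound it from above.

#include <bit>       // std::popcount, C++20
#include <cstdint>
#include <iostream>

// Invented stand-in for known-bits state on an i32 value.
struct KnownBits32 {
  uint32_t Zero; // bits known to be 0
  uint32_t One;  // bits known to be 1
};

int main() {
  // Example: the low nibble is known to be 0b1010, everything else is unknown.
  const KnownBits32 Known{/*Zero=*/0x5u, /*One=*/0xAu};
  const unsigned MinCount = std::popcount(Known.One);        // countMinPopulation
  const unsigned MaxCount = 32u - std::popcount(Known.Zero); // countMaxPopulation
  // The fold above would attach !range [MinCount, MaxCount + 1) to the ctpop.
  std::cout << "ctpop range: [" << MinCount << ", " << MaxCount + 1 << ")\n";
  return 0;
}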
+  auto *IT = dyn_cast<IntegerType>(Op0->getType()); +  if (!IT) +    return nullptr; + +  unsigned BitWidth = IT->getBitWidth(); +  KnownBits Known(BitWidth); +  IC.computeKnownBits(Op0, Known, 0, &II); + +  unsigned MinCount = Known.countMinPopulation(); +  unsigned MaxCount = Known.countMaxPopulation(); + +  // Add range metadata since known bits can't completely reflect what we know. +  if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) { +    Metadata *LowAndHigh[] = { +        ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)), +        ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))}; +    II.setMetadata(LLVMContext::MD_range, +                   MDNode::get(II.getContext(), LowAndHigh)); +    return &II; +  } +    return nullptr;  } @@ -1981,6 +2022,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {        return I;      break; +  case Intrinsic::ctpop: +    if (auto *I = foldCtpop(*II, *this)) +      return I; +    break; +    case Intrinsic::uadd_with_overflow:    case Intrinsic::sadd_with_overflow:    case Intrinsic::umul_with_overflow: diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 38e95fb11639..d3049389dfb9 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1896,6 +1896,18 @@ static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,      return BinaryOperator::Create(BO->getOpcode(), CastedOp0, X);    } +  // Canonicalize vector bitcasts to come before vector bitwise logic with a +  // constant. This eases recognition of special constants for later ops. +  // Example: +  // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b +  Constant *C; +  if (match(BO->getOperand(1), m_Constant(C))) { +    // bitcast (logic X, C) --> logic (bitcast X, C') +    Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy); +    Value *CastedC = ConstantExpr::getBitCast(C, DestTy); +    return BinaryOperator::Create(BO->getOpcode(), CastedOp0, CastedC); +  } +    return nullptr;  } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 1ef4acfb058c..6ad32490a328 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2434,6 +2434,77 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,    return nullptr;  } +bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, +                                           Value *&RHS, ConstantInt *&Less, +                                           ConstantInt *&Equal, +                                           ConstantInt *&Greater) { +  // TODO: Generalize this to work with other comparison idioms or ensure +  // they get canonicalized into this form. + +  // select i1 (a == b), i32 Equal, i32 (select i1 (a < b), i32 Less, i32 +  // Greater), where Equal, Less and Greater are placeholders for any three +  // constants. 
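A hedged source-level sketch in plain C++ (the cmp3 helper is invented for illustration) of the idiom described above, and of how a test of its result reduces to direct comparisons of the original operands:

#include <cassert>

// The three-way compare idiom the matcher looks for, at the source level.
static int cmp3(int a, int b) { return a == b ? 0 : (a < b ? -1 : 1); }

int main() {
  for (int a = -3; a <= 3; ++a)
    for (int b = -3; b <= 3; ++b) {
      // "icmp sgt (select-chain), -1" is true exactly when a == b or a > b,
      // so the whole test folds to a >= b without materializing cmp3's result.
      assert((cmp3(a, b) > -1) == (a >= b));
      // Similarly, "icmp eq (select-chain), 0" folds to a == b.
      assert((cmp3(a, b) == 0) == (a == b));
    }
  return 0;
}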
+  ICmpInst::Predicate PredA, PredB; +  if (match(SI->getTrueValue(), m_ConstantInt(Equal)) && +      match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) && +      PredA == ICmpInst::ICMP_EQ && +      match(SI->getFalseValue(), +            m_Select(m_ICmp(PredB, m_Specific(LHS), m_Specific(RHS)), +                     m_ConstantInt(Less), m_ConstantInt(Greater))) && +      PredB == ICmpInst::ICMP_SLT) { +    return true; +  } +  return false; +} + +Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp, +                                                  Instruction *Select, +                                                  ConstantInt *C) { + +  assert(C && "Cmp RHS should be a constant int!"); +  // If we're testing a constant value against the result of a three way +  // comparison, the result can be expressed directly in terms of the +  // original values being compared.  Note: We could possibly be more +  // aggressive here and remove the hasOneUse test. The original select is +  // really likely to simplify or sink when we remove a test of the result. +  Value *OrigLHS, *OrigRHS; +  ConstantInt *C1LessThan, *C2Equal, *C3GreaterThan; +  if (Cmp.hasOneUse() && +      matchThreeWayIntCompare(cast<SelectInst>(Select), OrigLHS, OrigRHS, +                                 C1LessThan, C2Equal, C3GreaterThan)) { +    assert(C1LessThan && C2Equal && C3GreaterThan); + +    bool TrueWhenLessThan = +        ConstantExpr::getCompare(Cmp.getPredicate(), C1LessThan, C) +            ->isAllOnesValue(); +    bool TrueWhenEqual = +        ConstantExpr::getCompare(Cmp.getPredicate(), C2Equal, C) +            ->isAllOnesValue(); +    bool TrueWhenGreaterThan = +        ConstantExpr::getCompare(Cmp.getPredicate(), C3GreaterThan, C) +            ->isAllOnesValue(); + +    // This generates the new instruction that will replace the original Cmp +    // Instruction. Instead of enumerating the various combinations when +    // TrueWhenLessThan, TrueWhenEqual and TrueWhenGreaterThan are true versus +    // false, we rely on chaining of ORs and future passes of InstCombine to +    // simplify the OR further (i.e. a s< b || a == b becomes a s<= b). + +    // When none of the three constants satisfy the predicate for the RHS (C), +    // the entire original Cmp can be simplified to a false. +    Value *Cond = Builder->getFalse(); +    if (TrueWhenLessThan) +      Cond = Builder->CreateOr(Cond, Builder->CreateICmp(ICmpInst::ICMP_SLT, OrigLHS, OrigRHS)); +    if (TrueWhenEqual) +      Cond = Builder->CreateOr(Cond, Builder->CreateICmp(ICmpInst::ICMP_EQ, OrigLHS, OrigRHS)); +    if (TrueWhenGreaterThan) +      Cond = Builder->CreateOr(Cond, Builder->CreateICmp(ICmpInst::ICMP_SGT, OrigLHS, OrigRHS)); + +    return replaceInstUsesWith(Cmp, Cond); +  } +  return nullptr; +} +  /// Try to fold integer comparisons with a constant operand: icmp Pred X, C  /// where X is some kind of instruction.  Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) { @@ -2493,11 +2564,28 @@ Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {        return I;    } +  // Match against CmpInst LHS being instructions other than binary operators.    
Instruction *LHSI; -  if (match(Cmp.getOperand(0), m_Instruction(LHSI)) && -      LHSI->getOpcode() == Instruction::Trunc) -    if (Instruction *I = foldICmpTruncConstant(Cmp, LHSI, C)) -      return I; +  if (match(Cmp.getOperand(0), m_Instruction(LHSI))) { +    switch (LHSI->getOpcode()) { +    case Instruction::Select: +      { +      // For now, we only support constant integers while folding the +      // ICMP(SELECT)) pattern. We can extend this to support vector of integers +      // similar to the cases handled by binary ops above. +      if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1))) +        if (Instruction *I = foldICmpSelectConstant(Cmp, LHSI, ConstRHS)) +          return I; +      break; +      } +    case Instruction::Trunc: +      if (Instruction *I = foldICmpTruncConstant(Cmp, LHSI, C)) +        return I; +      break; +    default: +      break; +    } +  }    if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, C))      return I; @@ -3110,8 +3198,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {    if (BO0) {      // Transform  A & (L - 1) `ult` L --> L != 0      auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes()); -    auto BitwiseAnd = -        m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value())); +    auto BitwiseAnd = m_c_And(m_Value(), LSubOne);      if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {        auto *Zero = Constant::getNullValue(BO0->getType()); diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 1a7db146df42..1b0fe84dd4dd 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -95,6 +95,18 @@ static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) {    }  } +/// Return the source operand of a potentially bitcasted value while optionally +/// checking if it has one use. If there is no bitcast or the one use check is +/// not met, return the input value itself. +static inline Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) { +  if (auto *BitCast = dyn_cast<BitCastInst>(V)) +    if (!OneUseOnly || BitCast->hasOneUse()) +      return BitCast->getOperand(0); + +  // V is not a bitcast or V has more than one use and OneUseOnly is true. +  return V; +} +  /// \brief Add one to a Constant  static inline Constant *AddOne(Constant *C) {    return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); @@ -276,10 +288,6 @@ public:    Instruction *visitFDiv(BinaryOperator &I);    Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted);    Instruction *visitAnd(BinaryOperator &I); -  Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A, -                                   Value *B, Value *C); -  Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, Value *A, -                                    Value *B, Value *C);    Instruction *visitOr(BinaryOperator &I);    Instruction *visitXor(BinaryOperator &I);    Instruction *visitShl(BinaryOperator &I); @@ -595,6 +603,15 @@ private:                            Instruction::BinaryOps, Value *, Value *, Value *,                            Value *); +  /// Match a select chain which produces one of three values based on whether +  /// the LHS is less than, equal to, or greater than RHS respectively. +  /// Return true if we matched a three way compare idiom. 
The LHS, RHS, Less, +  /// Equal and Greater values are saved in the matching process and returned to +  /// the caller. +  bool matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, Value *&RHS, +                               ConstantInt *&Less, ConstantInt *&Equal, +                               ConstantInt *&Greater); +    /// \brief Attempts to replace V with a simpler value based on the demanded    /// bits.    Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known, @@ -672,6 +689,8 @@ private:    Instruction *foldICmpBinOp(ICmpInst &Cmp);    Instruction *foldICmpEquality(ICmpInst &Cmp); +  Instruction *foldICmpSelectConstant(ICmpInst &Cmp, Instruction *Select, +                                      ConstantInt *C);    Instruction *foldICmpTruncConstant(ICmpInst &Cmp, Instruction *Trunc,                                       const APInt *C);    Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And, diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index a4d84ae81aa0..ca370c73fca4 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -169,6 +169,18 @@ isOnlyCopiedFromConstantGlobal(AllocaInst *AI,    return nullptr;  } +/// Returns true if V is dereferenceable for size of alloca. +static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI, +                                           const DataLayout &DL) { +  if (AI->isArrayAllocation()) +    return false; +  uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType()); +  if (!AllocaSize) +    return false; +  return isDereferenceableAndAlignedPointer(V, AI->getAlignment(), +                                            APInt(64, AllocaSize), DL); +} +  static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {    // Check for array size of 1 (scalar allocation).    if (!AI.isArrayAllocation()) { @@ -390,7 +402,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {      if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {        unsigned SourceAlign = getOrEnforceKnownAlignment(            Copy->getSource(), AI.getAlignment(), DL, &AI, &AC, &DT); -      if (AI.getAlignment() <= SourceAlign) { +      if (AI.getAlignment() <= SourceAlign && +          isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) {          DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');          DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');          for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) @@ -476,21 +489,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT        break;      case LLVMContext::MD_nonnull: -      // This only directly applies if the new type is also a pointer. -      if (NewTy->isPointerTy()) { -        NewLoad->setMetadata(ID, N); -        break; -      } -      // If it's integral now, translate it to !range metadata. 
-      if (NewTy->isIntegerTy()) { -        auto *ITy = cast<IntegerType>(NewTy); -        auto *NullInt = ConstantExpr::getPtrToInt( -            ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy); -        auto *NonNullInt = -            ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1)); -        NewLoad->setMetadata(LLVMContext::MD_range, -                             MDB.createRange(NonNullInt, NullInt)); -      } +      copyNonnullMetadata(LI, N, *NewLoad);        break;      case LLVMContext::MD_align:      case LLVMContext::MD_dereferenceable: @@ -500,17 +499,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT          NewLoad->setMetadata(ID, N);        break;      case LLVMContext::MD_range: -      // FIXME: It would be nice to propagate this in some way, but the type -      // conversions make it hard. - -      // If it's a pointer now and the range does not contain 0, make it !nonnull. -      if (NewTy->isPointerTy()) { -        unsigned BitWidth = IC.getDataLayout().getTypeSizeInBits(NewTy); -        if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) { -          MDNode *NN = MDNode::get(LI.getContext(), None); -          NewLoad->setMetadata(LLVMContext::MD_nonnull, NN); -        } -      } +      copyRangeMetadata(IC.getDataLayout(), LI, N, *NewLoad);        break;      }    } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index b9674d85634d..33951e66497a 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -303,7 +303,7 @@ Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,  /// We want to turn:  ///   (select (icmp eq (and X, C1), 0), Y, (or Y, C2))  /// into: -///   (or (shl (and X, C1), C3), y) +///   (or (shl (and X, C1), C3), Y)  /// iff:  ///   C1 and C2 are both powers of 2  /// where: @@ -317,19 +317,44 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,                                    Value *FalseVal,                                    InstCombiner::BuilderTy *Builder) {    const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition()); -  if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy()) +  if (!IC || !SI.getType()->isIntegerTy())      return nullptr;    Value *CmpLHS = IC->getOperand(0);    Value *CmpRHS = IC->getOperand(1); -  if (!match(CmpRHS, m_Zero())) -    return nullptr; +  Value *V; +  unsigned C1Log; +  bool IsEqualZero; +  bool NeedAnd = false; +  if (IC->isEquality()) { +    if (!match(CmpRHS, m_Zero())) +      return nullptr; + +    const APInt *C1; +    if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1)))) +      return nullptr; + +    V = CmpLHS; +    C1Log = C1->logBase2(); +    IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_EQ; +  } else if (IC->getPredicate() == ICmpInst::ICMP_SLT || +             IC->getPredicate() == ICmpInst::ICMP_SGT) { +    // We also need to recognize (icmp slt (trunc (X)), 0) and +    // (icmp sgt (trunc (X)), -1). 
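A small self-contained illustration in plain C++ (constants chosen arbitrarily, assuming two's complement narrowing) of the sign-bit case above: the signed compare of a truncated value tests a single bit of X, so the select of (or Y, C2) can be rewritten as mask, shift, and or:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 512; ++x)
    for (uint32_t y = 0; y < 16; ++y) {
      // (icmp slt (trunc x to i8), 0) only tests bit 7 of x.
      const bool SignBit = static_cast<int8_t>(static_cast<uint8_t>(x)) < 0;
      assert(SignBit == ((x & 0x80u) != 0));
      // So "select (slt (trunc x), 0), (or y, 4), y" becomes and, shift, or:
      const uint32_t Selected = SignBit ? (y | 4u) : y;  // C2 == 4, i.e. bit 2
      const uint32_t Folded = y | ((x & 0x80u) >> 5);    // C1 == 0x80, shift by 7 - 2
      assert(Selected == Folded);
    }
  return 0;
}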
+    IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_SGT; +    if ((IsEqualZero && !match(CmpRHS, m_AllOnes())) || +        (!IsEqualZero && !match(CmpRHS, m_Zero()))) +      return nullptr; + +    if (!match(CmpLHS, m_OneUse(m_Trunc(m_Value(V))))) +      return nullptr; -  Value *X; -  const APInt *C1; -  if (!match(CmpLHS, m_And(m_Value(X), m_Power2(C1)))) +    C1Log = CmpLHS->getType()->getScalarSizeInBits() - 1; +    NeedAnd = true; +  } else {      return nullptr; +  }    const APInt *C2;    bool OrOnTrueVal = false; @@ -340,11 +365,27 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,    if (!OrOnFalseVal && !OrOnTrueVal)      return nullptr; -  Value *V = CmpLHS;    Value *Y = OrOnFalseVal ? TrueVal : FalseVal; -  unsigned C1Log = C1->logBase2();    unsigned C2Log = C2->logBase2(); + +  bool NeedXor = (!IsEqualZero && OrOnFalseVal) || (IsEqualZero && OrOnTrueVal); +  bool NeedShift = C1Log != C2Log; +  bool NeedZExtTrunc = Y->getType()->getIntegerBitWidth() != +                       V->getType()->getIntegerBitWidth(); + +  // Make sure we don't create more instructions than we save. +  Value *Or = OrOnFalseVal ? FalseVal : TrueVal; +  if ((NeedShift + NeedXor + NeedZExtTrunc) > +      (IC->hasOneUse() + Or->hasOneUse())) +    return nullptr; + +  if (NeedAnd) { +    // Insert the AND instruction on the input to the truncate. +    APInt C1 = APInt::getOneBitSet(V->getType()->getScalarSizeInBits(), C1Log); +    V = Builder->CreateAnd(V, ConstantInt::get(V->getType(), C1)); +  } +    if (C2Log > C1Log) {      V = Builder->CreateZExtOrTrunc(V, Y->getType());      V = Builder->CreateShl(V, C2Log - C1Log); @@ -354,9 +395,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,    } else      V = Builder->CreateZExtOrTrunc(V, Y->getType()); -  ICmpInst::Predicate Pred = IC->getPredicate(); -  if ((Pred == ICmpInst::ICMP_NE && OrOnFalseVal) || -      (Pred == ICmpInst::ICMP_EQ && OrOnTrueVal)) +  if (NeedXor)      V = Builder->CreateXor(V, *C2);    return Builder->CreateOr(V, Y); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 8cec865c6422..1bb1a85367d1 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -556,8 +556,7 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {      // The inexact versions are deferred to DAGCombine, so we don't hide shl      // behind a bit mask.      const APInt *ShOp1; -    if (match(Op0, m_CombineOr(m_Exact(m_LShr(m_Value(X), m_APInt(ShOp1))), -                               m_Exact(m_AShr(m_Value(X), m_APInt(ShOp1)))))) { +    if (match(Op0, m_Exact(m_Shr(m_Value(X), m_APInt(ShOp1))))) {        unsigned ShrAmt = ShOp1->getZExtValue();        if (ShrAmt < ShAmt) {          // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1) diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 65e6d2e35905..02fac4fb37a4 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -939,9 +939,19 @@ Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {        // `TrueVInPred`.        if (InC && !isa<ConstantExpr>(InC) && isa<ConstantInt>(InC))          InV = InC->isNullValue() ? FalseVInPred : TrueVInPred; -      else +      else { +        // Generate the select in the same block as PN's current incoming block. 
+        // Note: ThisBB need not be the NonConstBB because vector constants +        // which are constants by definition are handled here. +        // FIXME: This can lead to an increase in IR generation because we might +        // generate selects for vector constant phi operand, that could not be +        // folded to TrueVInPred or FalseVInPred as done for ConstantInt. For +        // non-vector phis, this transformation was always profitable because +        // the select would be generated exactly once in the NonConstBB. +        Builder->SetInsertPoint(ThisBB->getTerminator());          InV = Builder->CreateSelect(PN->getIncomingValue(i),                                      TrueVInPred, FalseVInPred, "phitmp"); +      }        NewPN->addIncoming(InV, ThisBB);      }    } else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) { @@ -3002,6 +3012,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,          ++NumDeadInst;          DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');          Inst->eraseFromParent(); +        MadeIRChange = true;          continue;        } @@ -3015,6 +3026,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,            ++NumConstProp;            if (isInstructionTriviallyDead(Inst, TLI))              Inst->eraseFromParent(); +          MadeIRChange = true;            continue;          } diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 0d308810009d..4089d81ea3e1 100644 --- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -642,7 +642,12 @@ static bool promoteIndirectCalls(Module &M, bool InLTO, bool SamplePGO) {    if (DisableICP)      return false;    InstrProfSymtab Symtab; -  Symtab.create(M, InLTO); +  if (Error E = Symtab.create(M, InLTO)) { +    std::string SymtabFailure = toString(std::move(E)); +    DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n"); +    (void)SymtabFailure; +    return false; +  }    bool Changed = false;    for (auto &F : M) {      if (F.isDeclaration()) diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp index 37f88d5f95f1..9c14b0149fdc 100644 --- a/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -19,12 +19,14 @@  #include "llvm/ADT/StringRef.h"  #include "llvm/ADT/Triple.h"  #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/IR/Attributes.h"  #include "llvm/IR/BasicBlock.h"  #include "llvm/IR/Constant.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h"  #include "llvm/IR/Function.h"  #include "llvm/IR/GlobalValue.h"  #include "llvm/IR/GlobalVariable.h" @@ -40,7 +42,10 @@  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/Error.h"  #include "llvm/Support/ErrorHandling.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopSimplify.h"  #include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h"  #include <algorithm>  #include <cassert>  #include <cstddef> @@ -92,6 +97,35 @@ cl::opt<double> NumCountersPerValueSite(      // is usually smaller than 2.      
cl::init(1.0)); +cl::opt<bool> AtomicCounterUpdatePromoted( +    "atomic-counter-update-promoted", cl::ZeroOrMore, +    cl::desc("Do counter update using atomic fetch add " +             " for promoted counters only"), +    cl::init(false)); + +// If the option is not specified, the default behavior about whether +// counter promotion is done depends on how instrumentaiton lowering +// pipeline is setup, i.e., the default value of true of this option +// does not mean the promotion will be done by default. Explicitly +// setting this option can override the default behavior. +cl::opt<bool> DoCounterPromotion("do-counter-promotion", cl::ZeroOrMore, +                                 cl::desc("Do counter register promotion"), +                                 cl::init(false)); +cl::opt<unsigned> MaxNumOfPromotionsPerLoop( +    cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(10), +    cl::desc("Max number counter promotions per loop to avoid" +             " increasing register pressure too much")); + +// A debug option +cl::opt<int> +    MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1), +                       cl::desc("Max number of allowed counter promotions")); + +cl::opt<bool> SpeculativeCounterPromotion( +    cl::ZeroOrMore, "speculative-counter-promotion", cl::init(false), +    cl::desc("Allow counter promotion for loops with multiple exiting blocks " +             " or top-tested loops. ")); +  class InstrProfilingLegacyPass : public ModulePass {    InstrProfiling InstrProf; @@ -116,6 +150,123 @@ public:    }  }; +/// A helper class to promote one counter RMW operation in the loop +/// into register update. +/// +/// RWM update for the counter will be sinked out of the loop after +/// the transformation. +/// +class PGOCounterPromoterHelper : public LoadAndStorePromoter { +public: +  PGOCounterPromoterHelper(Instruction *L, Instruction *S, SSAUpdater &SSA, +                           Value *Init, BasicBlock *PH, +                           ArrayRef<BasicBlock *> ExitBlocks, +                           ArrayRef<Instruction *> InsertPts) +      : LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks), +        InsertPts(InsertPts) { +    assert(isa<LoadInst>(L)); +    assert(isa<StoreInst>(S)); +    SSA.AddAvailableValue(PH, Init); +  } +  void doExtraRewritesBeforeFinalDeletion() const override { +    for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { +      BasicBlock *ExitBlock = ExitBlocks[i]; +      Instruction *InsertPos = InsertPts[i]; +      // Get LiveIn value into the ExitBlock. If there are multiple +      // predecessors, the value is defined by a PHI node in this +      // block. +      Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); +      Value *Addr = cast<StoreInst>(Store)->getPointerOperand(); +      IRBuilder<> Builder(InsertPos); +      if (AtomicCounterUpdatePromoted) +        Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue, +                                AtomicOrdering::SequentiallyConsistent); +      else { +        LoadInst *OldVal = Builder.CreateLoad(Addr, "pgocount.promoted"); +        auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue); +        Builder.CreateStore(NewVal, Addr); +      } +    } +  } + +private: +  Instruction *Store; +  ArrayRef<BasicBlock *> ExitBlocks; +  ArrayRef<Instruction *> InsertPts; +}; + +/// A helper class to do register promotion for all profile counter +/// updates in a loop. 
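To make the effect of counter promotion concrete, a hedged sketch in plain C++ (the function and variable names are invented): the promoted form keeps the count in a register inside the loop and merges it into the counter memory once per loop exit, optionally with an atomic add as controlled by -atomic-counter-update-promoted.

#include <cassert>
#include <cstdint>

uint64_t CounterBefore = 0, CounterAfter = 0;

// Unpromoted lowering: a load/add/store of the counter on every iteration.
void runUnpromoted(int n) {
  for (int i = 0; i < n; ++i)
    CounterBefore = CounterBefore + 1; // counter RMW stays inside the loop
}

// Promoted lowering: accumulate in a register, merge at the loop exit.
void runPromoted(int n) {
  uint64_t Local = 0;                  // the SSA value the promoter introduces
  for (int i = 0; i < n; ++i)
    ++Local;
  CounterAfter += Local;               // single update in the exit block
}

int main() {
  runUnpromoted(1000);
  runPromoted(1000);
  assert(CounterBefore == CounterAfter);
  return 0;
}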
+/// +class PGOCounterPromoter { +public: +  PGOCounterPromoter(ArrayRef<LoadStorePair> Cands, Loop &Loop) +      : Candidates(Cands), ExitBlocks(), InsertPts(), ParentLoop(Loop) { + +    SmallVector<BasicBlock *, 8> LoopExitBlocks; +    SmallPtrSet<BasicBlock *, 8> BlockSet; +    ParentLoop.getExitBlocks(LoopExitBlocks); + +    for (BasicBlock *ExitBlock : LoopExitBlocks) { +      if (BlockSet.insert(ExitBlock).second) { +        ExitBlocks.push_back(ExitBlock); +        InsertPts.push_back(&*ExitBlock->getFirstInsertionPt()); +      } +    } +  } + +  bool run(int64_t *NumPromoted) { +    // We can't insert into a catchswitch. +    bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) { +      return isa<CatchSwitchInst>(Exit->getTerminator()); +    }); + +    if (HasCatchSwitch) +      return false; + +    if (!ParentLoop.hasDedicatedExits()) +      return false; + +    BasicBlock *PH = ParentLoop.getLoopPreheader(); +    if (!PH) +      return false; + +    BasicBlock *H = ParentLoop.getHeader(); +    bool TopTested = +        ((ParentLoop.getBlocks().size() > 1) && ParentLoop.isLoopExiting(H)); +    if (!SpeculativeCounterPromotion && +        (TopTested || ParentLoop.getExitingBlock() == nullptr)) +      return false; + +    unsigned Promoted = 0; +    for (auto &Cand : Candidates) { + +      SmallVector<PHINode *, 4> NewPHIs; +      SSAUpdater SSA(&NewPHIs); +      Value *InitVal = ConstantInt::get(Cand.first->getType(), 0); +      PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal, +                                        PH, ExitBlocks, InsertPts); +      Promoter.run(SmallVector<Instruction *, 2>({Cand.first, Cand.second})); +      Promoted++; +      if (Promoted >= MaxNumOfPromotionsPerLoop) +        break; +      (*NumPromoted)++; +      if (MaxNumOfPromotions != -1 && *NumPromoted >= MaxNumOfPromotions) +        break; +    } + +    DEBUG(dbgs() << Promoted << " counters promoted for loop (depth=" +                 << ParentLoop.getLoopDepth() << ")\n"); +    return Promoted != 0; +  } + +private: +  ArrayRef<LoadStorePair> Candidates; +  SmallVector<BasicBlock *, 8> ExitBlocks; +  SmallVector<Instruction *, 8> InsertPts; +  Loop &ParentLoop; +}; +  } // end anonymous namespace  PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) { @@ -147,6 +298,63 @@ static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) {    return dyn_cast<InstrProfIncrementInst>(Instr);  } +bool InstrProfiling::lowerIntrinsics(Function *F) { +  bool MadeChange = false; +  PromotionCandidates.clear(); +  for (BasicBlock &BB : *F) { +    for (auto I = BB.begin(), E = BB.end(); I != E;) { +      auto Instr = I++; +      InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr); +      if (Inc) { +        lowerIncrement(Inc); +        MadeChange = true; +      } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) { +        lowerValueProfileInst(Ind); +        MadeChange = true; +      } +    } +  } + +  if (!MadeChange) +    return false; + +  promoteCounterLoadStores(F); +  return true; +} + +bool InstrProfiling::isCounterPromotionEnabled() const { +  if (DoCounterPromotion.getNumOccurrences() > 0) +    return DoCounterPromotion; + +  return Options.DoCounterPromotion; +} + +void InstrProfiling::promoteCounterLoadStores(Function *F) { +  if (!isCounterPromotionEnabled()) +    return; + +  DominatorTree DT(*F); +  LoopInfo LI(DT); +  DenseMap<Loop *, SmallVector<LoadStorePair, 8>> LoopPromotionCandidates; + +  for (const 
auto &LoadStore : PromotionCandidates) { +    auto *CounterLoad = LoadStore.first; +    auto *CounterStore = LoadStore.second; +    BasicBlock *BB = CounterLoad->getParent(); +    Loop *ParentLoop = LI.getLoopFor(BB); +    if (!ParentLoop) +      continue; +    LoopPromotionCandidates[ParentLoop].emplace_back(CounterLoad, CounterStore); +  } + +  SmallVector<Loop *, 4> Loops = LI.getLoopsInPreorder(); + +  for (auto *Loop : Loops) { +    PGOCounterPromoter Promoter(LoopPromotionCandidates[Loop], *Loop); +    Promoter.run(&TotalCountersPromoted); +  } +} +  bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {    bool MadeChange = false; @@ -179,18 +387,7 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {    }    for (Function &F : M) -    for (BasicBlock &BB : F) -      for (auto I = BB.begin(), E = BB.end(); I != E;) { -        auto Instr = I++; -        InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr); -        if (Inc) { -          lowerIncrement(Inc); -          MadeChange = true; -        } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) { -          lowerValueProfileInst(Ind); -          MadeChange = true; -        } -      } +    MadeChange |= lowerIntrinsics(&F);    if (GlobalVariable *CoverageNamesVar =            M.getNamedGlobal(getCoverageUnusedNamesVarName())) { @@ -303,9 +500,12 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {    IRBuilder<> Builder(Inc);    uint64_t Index = Inc->getIndex()->getZExtValue();    Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index); -  Value *Count = Builder.CreateLoad(Addr, "pgocount"); -  Count = Builder.CreateAdd(Count, Inc->getStep()); -  Inc->replaceAllUsesWith(Builder.CreateStore(Count, Addr)); +  Value *Load = Builder.CreateLoad(Addr, "pgocount"); +  auto *Count = Builder.CreateAdd(Load, Inc->getStep()); +  auto *Store = Builder.CreateStore(Count, Addr); +  Inc->replaceAllUsesWith(Store); +  if (isCounterPromotionEnabled()) +    PromotionCandidates.emplace_back(cast<Instruction>(Load), Store);    Inc->eraseFromParent();  } diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index b2d95271479c..0e7d11c55397 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1177,7 +1177,7 @@ void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) {    Builder.CreateCall(        Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),        {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), -       Builder.getInt64(FuncHash), Builder.CreatePtrToInt(Length, Int64Ty), +       Builder.getInt64(FuncHash), Builder.CreateZExtOrTrunc(Length, Int64Ty),         Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)});    ++CurCtrId;  } diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index f5196cc46181..457c9427ab9a 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -22,7 +22,6 @@ add_llvm_library(LLVMScalarOpts    LICM.cpp    LoopAccessAnalysisPrinter.cpp    LoopSink.cpp -  LoadCombine.cpp    LoopDeletion.cpp    LoopDataPrefetch.cpp    LoopDistribute.cpp diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 2a4c9526dfcd..28157783daa7 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ 
b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -232,8 +232,7 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {    pred_iterator PB = pred_begin(BB), PE = pred_end(BB);    if (PB == PE) return false; -  // Analyse each switch case in turn.  This is done in reverse order so that -  // removing a case doesn't cause trouble for the iteration. +  // Analyse each switch case in turn.    bool Changed = false;    for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {      ConstantInt *Case = CI->getCaseValue(); @@ -291,7 +290,7 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {        break;      } -    // Increment the case iterator sense we didn't delete it. +    // Increment the case iterator since we didn't delete it.      ++CI;    } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 0490d93f6455..c0f628eb61e6 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -80,9 +80,10 @@ MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore,  struct llvm::GVN::Expression {    uint32_t opcode;    Type *type; +  bool commutative;    SmallVector<uint32_t, 4> varargs; -  Expression(uint32_t o = ~2U) : opcode(o) {} +  Expression(uint32_t o = ~2U) : opcode(o), commutative(false) {}    bool operator==(const Expression &other) const {      if (opcode != other.opcode) @@ -246,6 +247,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {      assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");      if (e.varargs[0] > e.varargs[1])        std::swap(e.varargs[0], e.varargs[1]); +    e.commutative = true;    }    if (CmpInst *C = dyn_cast<CmpInst>(I)) { @@ -256,6 +258,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {        Predicate = CmpInst::getSwappedPredicate(Predicate);      }      e.opcode = (C->getOpcode() << 8) | Predicate; +    e.commutative = true;    } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) {      for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end();           II != IE; ++II) @@ -281,6 +284,7 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode,      Predicate = CmpInst::getSwappedPredicate(Predicate);    }    e.opcode = (Opcode << 8) | Predicate; +  e.commutative = true;    return e;  } @@ -348,25 +352,25 @@ GVN::ValueTable::~ValueTable() = default;  /// add - Insert a value into the table with a specified value number.  
void GVN::ValueTable::add(Value *V, uint32_t num) {    valueNumbering.insert(std::make_pair(V, num)); +  if (PHINode *PN = dyn_cast<PHINode>(V)) +    NumberingPhi[num] = PN;  }  uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {    if (AA->doesNotAccessMemory(C)) {      Expression exp = createExpr(C); -    uint32_t &e = expressionNumbering[exp]; -    if (!e) e = nextValueNumber++; +    uint32_t e = assignExpNewValueNum(exp).first;      valueNumbering[C] = e;      return e;    } else if (AA->onlyReadsMemory(C)) {      Expression exp = createExpr(C); -    uint32_t &e = expressionNumbering[exp]; -    if (!e) { -      e = nextValueNumber++; -      valueNumbering[C] = e; -      return e; +    auto ValNum = assignExpNewValueNum(exp); +    if (ValNum.second) { +      valueNumbering[C] = ValNum.first; +      return ValNum.first;      }      if (!MD) { -      e = nextValueNumber++; +      uint32_t e = assignExpNewValueNum(exp).first;        valueNumbering[C] = e;        return e;      } @@ -522,23 +526,29 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) {      case Instruction::ExtractValue:        exp = createExtractvalueExpr(cast<ExtractValueInst>(I));        break; +    case Instruction::PHI: +      valueNumbering[V] = nextValueNumber; +      NumberingPhi[nextValueNumber] = cast<PHINode>(V); +      return nextValueNumber++;      default:        valueNumbering[V] = nextValueNumber;        return nextValueNumber++;    } -  uint32_t& e = expressionNumbering[exp]; -  if (!e) e = nextValueNumber++; +  uint32_t e = assignExpNewValueNum(exp).first;    valueNumbering[V] = e;    return e;  }  /// Returns the value number of the specified value. Fails if  /// the value has not yet been numbered. -uint32_t GVN::ValueTable::lookup(Value *V) const { +uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const {    DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); -  assert(VI != valueNumbering.end() && "Value not numbered?"); -  return VI->second; +  if (Verify) { +    assert(VI != valueNumbering.end() && "Value not numbered?"); +    return VI->second; +  } +  return (VI != valueNumbering.end()) ? VI->second : 0;  }  /// Returns the value number of the given comparison, @@ -549,21 +559,28 @@ uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode,                                           CmpInst::Predicate Predicate,                                           Value *LHS, Value *RHS) {    Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS); -  uint32_t& e = expressionNumbering[exp]; -  if (!e) e = nextValueNumber++; -  return e; +  return assignExpNewValueNum(exp).first;  }  /// Remove all entries from the ValueTable.  void GVN::ValueTable::clear() {    valueNumbering.clear();    expressionNumbering.clear(); +  NumberingPhi.clear(); +  PhiTranslateTable.clear();    nextValueNumber = 1; +  Expressions.clear(); +  ExprIdx.clear(); +  nextExprNumber = 0;  }  /// Remove a value from the value numbering.  void GVN::ValueTable::erase(Value *V) { +  uint32_t Num = valueNumbering.lookup(V);    valueNumbering.erase(V); +  // If V is PHINode, V <--> value number is an one-to-one mapping. 
+  if (isa<PHINode>(V)) +    NumberingPhi.erase(Num);  }  /// verifyRemoved - Verify that the value is removed from all internal data @@ -602,7 +619,7 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {  }  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) { +LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const {    errs() << "{\n";    for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),         E = d.end(); I != E; ++I) { @@ -1451,6 +1468,95 @@ bool GVN::processLoad(LoadInst *L) {    return false;  } +/// Return a pair the first field showing the value number of \p Exp and the +/// second field showing whether it is a value number newly created. +std::pair<uint32_t, bool> +GVN::ValueTable::assignExpNewValueNum(Expression &Exp) { +  uint32_t &e = expressionNumbering[Exp]; +  bool CreateNewValNum = !e; +  if (CreateNewValNum) { +    Expressions.push_back(Exp); +    if (ExprIdx.size() < nextValueNumber + 1) +      ExprIdx.resize(nextValueNumber * 2); +    e = nextValueNumber; +    ExprIdx[nextValueNumber++] = nextExprNumber++; +  } +  return {e, CreateNewValNum}; +} + +/// Return whether all the values related with the same \p num are +/// defined in \p BB. +bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, +                                     GVN &Gvn) { +  LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; +  while (Vals && Vals->BB == BB) +    Vals = Vals->Next; +  return !Vals; +} + +/// Wrap phiTranslateImpl to provide caching functionality. +uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, +                                       const BasicBlock *PhiBlock, uint32_t Num, +                                       GVN &Gvn) { +  auto FindRes = PhiTranslateTable.find({Num, Pred}); +  if (FindRes != PhiTranslateTable.end()) +    return FindRes->second; +  uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn); +  PhiTranslateTable.insert({{Num, Pred}, NewNum}); +  return NewNum; +} + +/// Translate value number \p Num using phis, so that it has the values of +/// the phis in BB. +uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, +                                           const BasicBlock *PhiBlock, +                                           uint32_t Num, GVN &Gvn) { +  if (PHINode *PN = NumberingPhi[Num]) { +    for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { +      if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred) +        if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false)) +          return TransVal; +    } +    return Num; +  } + +  // If there is any value related with Num is defined in a BB other than +  // PhiBlock, it cannot depend on a phi in PhiBlock without going through +  // a backedge. We can do an early exit in that case to save compile time. +  if (!areAllValsInBB(Num, PhiBlock, Gvn)) +    return Num; + +  if (Num >= ExprIdx.size() || ExprIdx[Num] == 0) +    return Num; +  Expression Exp = Expressions[ExprIdx[Num]]; + +  for (unsigned i = 0; i < Exp.varargs.size(); i++) { +    // For InsertValue and ExtractValue, some varargs are index numbers +    // instead of value numbers. Those index numbers should not be +    // translated. 
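A hedged, self-contained sketch in plain C++ (the Expr key and opcode value are invented, not GVN's types) of the commutative canonicalization that the translation below relies on: operand value numbers are kept sorted when an expression is numbered, so after phi translation rewrites them the expression has to be re-sorted before the table lookup can match.

#include <algorithm>
#include <cassert>
#include <map>
#include <utility>

// Invented toy key: an opcode plus two operand value numbers.
using Expr = std::pair<unsigned, std::pair<unsigned, unsigned>>;

static Expr makeExpr(unsigned Opcode, unsigned A, unsigned B, bool Commutative) {
  if (Commutative && A > B)
    std::swap(A, B); // keep commutative operands sorted, as createExpr does
  return {Opcode, {A, B}};
}

int main() {
  std::map<Expr, unsigned> Table;
  const unsigned Add = 13; // arbitrary opcode value for illustration
  Table[makeExpr(Add, /*vn(a)=*/4, /*vn(b)=*/7, true)] = 1;

  // "b + a" and "a + b" reach the same entry because operands are sorted.
  assert(Table.count(makeExpr(Add, 7, 4, true)) == 1);

  // After phi translation rewrites the operand numbers, the expression must be
  // re-canonicalized the same way before looking it up, mirroring the
  // Exp.commutative handling below.
  assert(makeExpr(Add, 9, 7, true) == makeExpr(Add, 7, 9, true));
  return 0;
}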
+    if ((i > 1 && Exp.opcode == Instruction::InsertValue) || +        (i > 0 && Exp.opcode == Instruction::ExtractValue)) +      continue; +    Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn); +  } + +  if (Exp.commutative) { +    assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!"); +    if (Exp.varargs[0] > Exp.varargs[1]) { +      std::swap(Exp.varargs[0], Exp.varargs[1]); +      uint32_t Opcode = Exp.opcode >> 8; +      if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) +        Exp.opcode = (Opcode << 8) | +                     CmpInst::getSwappedPredicate( +                         static_cast<CmpInst::Predicate>(Exp.opcode & 255)); +    } +  } + +  if (uint32_t NewNum = expressionNumbering[Exp]) +    return NewNum; +  return Num; +} +  // In order to find a leader for a given value number at a  // specific basic block, we first obtain the list of all Values for that number,  // and then scan the list to find one whose block dominates the block in @@ -1495,6 +1601,15 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,    return Pred != nullptr;  } + +void GVN::assignBlockRPONumber(Function &F) { +  uint32_t NextBlockNumber = 1; +  ReversePostOrderTraversal<Function *> RPOT(&F); +  for (BasicBlock *BB : RPOT) +    BlockRPONumber[BB] = NextBlockNumber++; +} + +  // Tries to replace instruction with const, using information from  // ReplaceWithConstMap.  bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { @@ -1856,6 +1971,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,      // Fabricate val-num for dead-code in order to suppress assertion in      // performPRE().      assignValNumForDeadCode(); +    assignBlockRPONumber(F);      bool PREChanged = true;      while (PREChanged) {        PREChanged = performPRE(F); @@ -1927,7 +2043,7 @@ bool GVN::processBlock(BasicBlock *BB) {  // Instantiate an expression in a predecessor that lacked it.  bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, -                                    unsigned int ValNo) { +                                    BasicBlock *Curr, unsigned int ValNo) {    // Because we are going top-down through the block, all value numbers    // will be available in the predecessor by the time we need them.  Any    // that weren't originally present will have been instantiated earlier @@ -1945,7 +2061,9 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,        success = false;        break;      } -    if (Value *V = findLeader(Pred, VN.lookup(Op))) { +    uint32_t TValNo = +        VN.phiTranslate(Pred, Curr, VN.lookup(Op), *this); +    if (Value *V = findLeader(Pred, TValNo)) {        Instr->setOperand(i, V);      } else {        success = false; @@ -1962,10 +2080,12 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,    Instr->insertBefore(Pred->getTerminator());    Instr->setName(Instr->getName() + ".pre");    Instr->setDebugLoc(Instr->getDebugLoc()); -  VN.add(Instr, ValNo); + +  unsigned Num = VN.lookupOrAdd(Instr); +  VN.add(Instr, Num);    // Update the availability map to include the new instruction. 
-  addToLeaderTable(ValNo, Instr, Pred);
+  addToLeaderTable(Num, Instr, Pred);
 
   return true;
 }
 
@@ -2003,18 +2123,27 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
 
   SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap;
   for (BasicBlock *P : predecessors(CurrentBlock)) {
-    // We're not interested in PRE where the block is its
-    // own predecessor, or in blocks with predecessors
-    // that are not reachable.
-    if (P == CurrentBlock) {
+    // We're not interested in PRE where blocks have predecessors that are
+    // not reachable.
+    if (!DT->isReachableFromEntry(P)) {
       NumWithout = 2;
       break;
-    } else if (!DT->isReachableFromEntry(P)) {
+    }
+    // It is not safe to do PRE when P->CurrentBlock is a loop backedge, and
+    // when CurInst has an operand defined in CurrentBlock (so it may be
+    // defined by a phi in the loop header).
+    if (BlockRPONumber[P] >= BlockRPONumber[CurrentBlock] &&
+        any_of(CurInst->operands(), [&](const Use &U) {
+          if (auto *Inst = dyn_cast<Instruction>(U.get()))
+            return Inst->getParent() == CurrentBlock;
+          return false;
+        })) {
       NumWithout = 2;
       break;
     }
-    Value *predV = findLeader(P, ValNo);
+    uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this);
+    Value *predV = findLeader(P, TValNo);
     if (!predV) {
       predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
       PREPred = P;
@@ -2054,7 +2183,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
     }
     // We need to insert somewhere, so let's give it a shot
     PREInstr = CurInst->clone();
-    if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) {
+    if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) {
      // If we failed insertion, make sure we remove the instruction.
       DEBUG(verifyRemoved(PREInstr));
       PREInstr->deleteValue();
@@ -2168,6 +2297,7 @@ bool GVN::iterateOnFunction(Function &F) {
 void GVN::cleanupGlobalSets() {
   VN.clear();
   LeaderTable.clear();
+  BlockRPONumber.clear();
   TableAllocator.Reset();
 }
 
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index c120036464d0..05293eb0079f 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
@@ -576,7 +577,12 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
   // Handle compare with phi operand, where the PHI is defined in this block.
   if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
     assert(Preference == WantInteger && "Compares only produce integers");
-    PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0));
+    Type *CmpType = Cmp->getType();
+    Value *CmpLHS = Cmp->getOperand(0);
+    Value *CmpRHS = Cmp->getOperand(1);
+    CmpInst::Predicate Pred = Cmp->getPredicate();
+
+    PHINode *PN = dyn_cast<PHINode>(CmpLHS);
     if (PN && PN->getParent() == BB) {
       const DataLayout &DL = PN->getModule()->getDataLayout();
       // We can do this simplification if any comparisons fold to true or false.
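The phi translation used by the GVN scalar-PRE changes above can be pictured with a small, self-contained sketch. The types, the map, and the helper name below are hypothetical stand-ins rather than the ValueTable API; the real implementation also translates the operands of non-phi expressions and caches results in PhiTranslateTable.

    #include <cstdint>
    #include <map>
    #include <utility>
    #include <vector>

    using ValNum = uint32_t;

    // Hypothetical stand-in for a phi: the block it lives in plus, for each
    // predecessor edge, the value number of the incoming value.
    struct PhiRecord {
      int Block;
      std::vector<std::pair<int, ValNum>> Incoming; // (pred block id, value num)
    };

    // Translate Num as seen from predecessor Pred of PhiBlock: if Num names a
    // phi of PhiBlock, the translated number is that of the value incoming
    // from Pred; otherwise the number is unchanged.
    ValNum phiTranslateSketch(const std::map<ValNum, PhiRecord> &NumberingPhi,
                              int Pred, int PhiBlock, ValNum Num) {
      auto It = NumberingPhi.find(Num);
      if (It == NumberingPhi.end() || It->second.Block != PhiBlock)
        return Num;
      for (const auto &In : It->second.Incoming)
        if (In.first == Pred)
          return In.second;
      return Num;
    }

This is what allows the findLeader(P, TValNo) call in the hunk above to find an existing computation in a predecessor even when the instruction's operands are phis of the current block.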
@@ -584,15 +590,15 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(        for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {          BasicBlock *PredBB = PN->getIncomingBlock(i);          Value *LHS = PN->getIncomingValue(i); -        Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB); +        Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB); -        Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, {DL}); +        Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});          if (!Res) {            if (!isa<Constant>(RHS))              continue;            LazyValueInfo::Tristate -            ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS, +            ResT = LVI->getPredicateOnEdge(Pred, LHS,                                             cast<Constant>(RHS), PredBB, BB,                                             CxtI ? CxtI : Cmp);            if (ResT == LazyValueInfo::Unknown) @@ -609,27 +615,67 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(      // If comparing a live-in value against a constant, see if we know the      // live-in value on any predecessors. -    if (isa<Constant>(Cmp->getOperand(1)) && !Cmp->getType()->isVectorTy()) { -      Constant *CmpConst = cast<Constant>(Cmp->getOperand(1)); +    if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) { +      Constant *CmpConst = cast<Constant>(CmpRHS); -      if (!isa<Instruction>(Cmp->getOperand(0)) || -          cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) { +      if (!isa<Instruction>(CmpLHS) || +          cast<Instruction>(CmpLHS)->getParent() != BB) {          for (BasicBlock *P : predecessors(BB)) {            // If the value is known by LazyValueInfo to be a constant in a            // predecessor, use that information to try to thread this block.            LazyValueInfo::Tristate Res = -            LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0), +            LVI->getPredicateOnEdge(Pred, CmpLHS,                                      CmpConst, P, BB, CxtI ? CxtI : Cmp);            if (Res == LazyValueInfo::Unknown)              continue; -          Constant *ResC = ConstantInt::get(Cmp->getType(), Res); +          Constant *ResC = ConstantInt::get(CmpType, Res);            Result.push_back(std::make_pair(ResC, P));          }          return !Result.empty();        } +      // InstCombine can fold some forms of constant range checks into +      // (icmp (add (x, C1)), C2). See if we have we have such a thing with +      // x as a live-in. +      { +        using namespace PatternMatch; +        Value *AddLHS; +        ConstantInt *AddConst; +        if (isa<ConstantInt>(CmpConst) && +            match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) { +          if (!isa<Instruction>(AddLHS) || +              cast<Instruction>(AddLHS)->getParent() != BB) { +            for (BasicBlock *P : predecessors(BB)) { +              // If the value is known by LazyValueInfo to be a ConstantRange in +              // a predecessor, use that information to try to thread this +              // block. +              ConstantRange CR = LVI->getConstantRangeOnEdge( +                  AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS)); +              // Propagate the range through the addition. +              CR = CR.add(AddConst->getValue()); + +              // Get the range where the compare returns true. 
+              ConstantRange CmpRange = ConstantRange::makeExactICmpRegion( +                  Pred, cast<ConstantInt>(CmpConst)->getValue()); + +              Constant *ResC; +              if (CmpRange.contains(CR)) +                ResC = ConstantInt::getTrue(CmpType); +              else if (CmpRange.inverse().contains(CR)) +                ResC = ConstantInt::getFalse(CmpType); +              else +                continue; + +              Result.push_back(std::make_pair(ResC, P)); +            } + +            return !Result.empty(); +          } +        } +      } +        // Try to find a constant value for the LHS of a comparison,        // and evaluate it statically if we can.        PredValueInfoTy LHSVals; @@ -638,8 +684,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(        for (const auto &LHSVal : LHSVals) {          Constant *V = LHSVal.first; -        Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(), -                                                    V, CmpConst); +        Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);          if (Constant *KC = getKnownConstant(Folded, WantInteger))            Result.push_back(std::make_pair(KC, LHSVal.second));        } @@ -752,6 +797,37 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {        LVI->eraseBlock(SinglePred);        MergeBasicBlockIntoOnlyPred(BB); +      // Now that BB is merged into SinglePred (i.e. SinglePred Code followed by +      // BB code within one basic block `BB`), we need to invalidate the LVI +      // information associated with BB, because the LVI information need not be +      // true for all of BB after the merge. For example, +      // Before the merge, LVI info and code is as follows: +      // SinglePred: <LVI info1 for %p val> +      // %y = use of %p +      // call @exit() // need not transfer execution to successor. +      // assume(%p) // from this point on %p is true +      // br label %BB +      // BB: <LVI info2 for %p val, i.e. %p is true> +      // %x = use of %p +      // br label exit +      // +      // Note that this LVI info for blocks BB and SinglPred is correct for %p +      // (info2 and info1 respectively). After the merge and the deletion of the +      // LVI info1 for SinglePred. We have the following code: +      // BB: <LVI info2 for %p val> +      // %y = use of %p +      // call @exit() +      // assume(%p) +      // %x = use of %p <-- LVI info2 is correct from here onwards. +      // br label exit +      // LVI info2 for BB is incorrect at the beginning of BB. + +      // Invalidate LVI information for BB if the LVI is not provably true for +      // all of BB. +      if (any_of(*BB, [](Instruction &I) { +            return !isGuaranteedToTransferExecutionToSuccessor(&I); +          })) +        LVI->eraseBlock(BB);        return true;      }    } diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp deleted file mode 100644 index 025ba1bfedc1..000000000000 --- a/lib/Transforms/Scalar/LoadCombine.cpp +++ /dev/null @@ -1,295 +0,0 @@ -//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===// -// -//                     The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This transformation combines adjacent loads. 
-/// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/TargetFolder.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" - -using namespace llvm; - -#define DEBUG_TYPE "load-combine" - -STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining"); -STATISTIC(NumLoadsCombined, "Number of loads combined"); - -#define LDCOMBINE_NAME "Combine Adjacent Loads" - -namespace { -struct PointerOffsetPair { -  Value *Pointer; -  APInt Offset; -}; - -struct LoadPOPPair { -  LoadInst *Load; -  PointerOffsetPair POP; -  /// \brief The new load needs to be created before the first load in IR order. -  unsigned InsertOrder; -}; - -class LoadCombine : public BasicBlockPass { -  LLVMContext *C; -  AliasAnalysis *AA; -  DominatorTree *DT; - -public: -  LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) { -    initializeLoadCombinePass(*PassRegistry::getPassRegistry()); -  } - -  using llvm::Pass::doInitialization; -  bool doInitialization(Function &) override; -  bool runOnBasicBlock(BasicBlock &BB) override; -  void getAnalysisUsage(AnalysisUsage &AU) const override { -    AU.setPreservesCFG(); -    AU.addRequired<AAResultsWrapperPass>(); -    AU.addRequired<DominatorTreeWrapperPass>(); -    AU.addPreserved<GlobalsAAWrapperPass>(); -  } - -  StringRef getPassName() const override { return LDCOMBINE_NAME; } -  static char ID; - -  typedef IRBuilder<TargetFolder> BuilderTy; - -private: -  BuilderTy *Builder; - -  PointerOffsetPair getPointerOffsetPair(LoadInst &); -  bool combineLoads(DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &); -  bool aggregateLoads(SmallVectorImpl<LoadPOPPair> &); -  bool combineLoads(SmallVectorImpl<LoadPOPPair> &); -}; -} - -bool LoadCombine::doInitialization(Function &F) { -  DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n"); -  C = &F.getContext(); -  return true; -} - -PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) { -  auto &DL = LI.getModule()->getDataLayout(); - -  PointerOffsetPair POP; -  POP.Pointer = LI.getPointerOperand(); -  unsigned BitWidth = DL.getPointerSizeInBits(LI.getPointerAddressSpace()); -  POP.Offset = APInt(BitWidth, 0); - -  while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) { -    if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) { -      APInt LastOffset = POP.Offset; -      if (!GEP->accumulateConstantOffset(DL, POP.Offset)) { -        // Can't handle GEPs with variable indices. 
-        POP.Offset = LastOffset; -        return POP; -      } -      POP.Pointer = GEP->getPointerOperand(); -    } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) { -      POP.Pointer = BC->getOperand(0); -    } -  } -  return POP; -} - -bool LoadCombine::combineLoads( -    DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &LoadMap) { -  bool Combined = false; -  for (auto &Loads : LoadMap) { -    if (Loads.second.size() < 2) -      continue; -    std::sort(Loads.second.begin(), Loads.second.end(), -              [](const LoadPOPPair &A, const LoadPOPPair &B) { -                return A.POP.Offset.slt(B.POP.Offset); -              }); -    if (aggregateLoads(Loads.second)) -      Combined = true; -  } -  return Combined; -} - -/// \brief Try to aggregate loads from a sorted list of loads to be combined. -/// -/// It is guaranteed that no writes occur between any of the loads. All loads -/// have the same base pointer. There are at least two loads. -bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) { -  assert(Loads.size() >= 2 && "Insufficient loads!"); -  LoadInst *BaseLoad = nullptr; -  SmallVector<LoadPOPPair, 8> AggregateLoads; -  bool Combined = false; -  bool ValidPrevOffset = false; -  APInt PrevOffset; -  uint64_t PrevSize = 0; -  for (auto &L : Loads) { -    if (ValidPrevOffset == false) { -      BaseLoad = L.Load; -      PrevOffset = L.POP.Offset; -      PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( -          L.Load->getType()); -      AggregateLoads.push_back(L); -      ValidPrevOffset = true; -      continue; -    } -    if (L.Load->getAlignment() > BaseLoad->getAlignment()) -      continue; -    APInt PrevEnd = PrevOffset + PrevSize; -    if (L.POP.Offset.sgt(PrevEnd)) { -      // No other load will be combinable -      if (combineLoads(AggregateLoads)) -        Combined = true; -      AggregateLoads.clear(); -      ValidPrevOffset = false; -      continue; -    } -    if (L.POP.Offset != PrevEnd) -      // This load is offset less than the size of the last load. -      // FIXME: We may want to handle this case. -      continue; -    PrevOffset = L.POP.Offset; -    PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( -        L.Load->getType()); -    AggregateLoads.push_back(L); -  } -  if (combineLoads(AggregateLoads)) -    Combined = true; -  return Combined; -} - -/// \brief Given a list of combinable load. Combine the maximum number of them. -bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { -  // Remove loads from the end while the size is not a power of 2. -  unsigned TotalSize = 0; -  for (const auto &L : Loads) -    TotalSize += L.Load->getType()->getPrimitiveSizeInBits(); -  while (TotalSize != 0 && !isPowerOf2_32(TotalSize)) -    TotalSize -= Loads.pop_back_val().Load->getType()->getPrimitiveSizeInBits(); -  if (Loads.size() < 2) -    return false; - -  DEBUG({ -    dbgs() << "***** Combining Loads ******\n"; -    for (const auto &L : Loads) { -      dbgs() << L.POP.Offset << ": " << *L.Load << "\n"; -    } -  }); - -  // Find first load. This is where we put the new load. 
-  LoadPOPPair FirstLP; -  FirstLP.InsertOrder = -1u; -  for (const auto &L : Loads) -    if (L.InsertOrder < FirstLP.InsertOrder) -      FirstLP = L; - -  unsigned AddressSpace = -      FirstLP.POP.Pointer->getType()->getPointerAddressSpace(); - -  Builder->SetInsertPoint(FirstLP.Load); -  Value *Ptr = Builder->CreateConstGEP1_64( -      Builder->CreatePointerCast(Loads[0].POP.Pointer, -                                 Builder->getInt8PtrTy(AddressSpace)), -      Loads[0].POP.Offset.getSExtValue()); -  LoadInst *NewLoad = new LoadInst( -      Builder->CreatePointerCast( -          Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize), -                                Ptr->getType()->getPointerAddressSpace())), -      Twine(Loads[0].Load->getName()) + ".combined", false, -      Loads[0].Load->getAlignment(), FirstLP.Load); - -  for (const auto &L : Loads) { -    Builder->SetInsertPoint(L.Load); -    Value *V = Builder->CreateExtractInteger( -        L.Load->getModule()->getDataLayout(), NewLoad, -        cast<IntegerType>(L.Load->getType()), -        (L.POP.Offset - Loads[0].POP.Offset).getZExtValue(), "combine.extract"); -    L.Load->replaceAllUsesWith(V); -  } - -  NumLoadsCombined += Loads.size(); -  return true; -} - -bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { -  if (skipBasicBlock(BB)) -    return false; - -  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); -  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - -  // Skip analysing dead blocks (not forward reachable from function entry). -  if (!DT->isReachableFromEntry(&BB)) { -    DEBUG(dbgs() << "LC: skipping unreachable " << BB.getName() << -          " in " << BB.getParent()->getName() << "\n"); -    return false; -  } - -  IRBuilder<TargetFolder> TheBuilder( -      BB.getContext(), TargetFolder(BB.getModule()->getDataLayout())); -  Builder = &TheBuilder; - -  DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap; -  AliasSetTracker AST(*AA); - -  bool Combined = false; -  unsigned Index = 0; -  for (auto &I : BB) { -    if (I.mayThrow() || AST.containsUnknown(&I)) { -      if (combineLoads(LoadMap)) -        Combined = true; -      LoadMap.clear(); -      AST.clear(); -      continue; -    } -    if (I.mayWriteToMemory()) { -      AST.add(&I); -      continue; -    } -    LoadInst *LI = dyn_cast<LoadInst>(&I); -    if (!LI) -      continue; -    ++NumLoadsAnalyzed; -    if (!LI->isSimple() || !LI->getType()->isIntegerTy()) -      continue; -    auto POP = getPointerOffsetPair(*LI); -    if (!POP.Pointer) -      continue; -    LoadMap[POP.Pointer].push_back({LI, std::move(POP), Index++}); -    AST.add(LI); -  } -  if (combineLoads(LoadMap)) -    Combined = true; -  return Combined; -} - -char LoadCombine::ID = 0; - -BasicBlockPass *llvm::createLoadCombinePass() { -  return new LoadCombine(); -} - -INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false) diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 3151ccd279c4..c41cc42db5e2 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -31,20 +31,19 @@ using namespace llvm;  STATISTIC(NumDeleted, "Number of loops deleted");  /// This function deletes dead loops. The caller of this function needs to -/// guarantee that the loop is infact dead.  
Here we handle two kinds of dead +/// guarantee that the loop is infact dead. Here we handle two kinds of dead  /// loop. The first kind (\p isLoopDead) is where only invariant values from  /// within the loop are used outside of it. The second kind (\p  /// isLoopNeverExecuted) is where the loop is provably never executed. We can -/// always remove never executed loops since they will not cause any -/// difference to program behaviour. +/// always remove never executed loops since they will not cause any difference +/// to program behaviour.  ///   /// This also updates the relevant analysis information in \p DT, \p SE, and \p  /// LI. It also updates the loop PM if an updater struct is provided.  // TODO: This function will be used by loop-simplifyCFG as well. So, move this  // to LoopUtils.cpp  static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, -                           LoopInfo &LI, bool LoopIsNeverExecuted, -                           LPMUpdater *Updater = nullptr); +                           LoopInfo &LI, LPMUpdater *Updater = nullptr);  /// Determines if a loop is dead.  ///  /// This assumes that we've already checked for unique exit and exiting blocks, @@ -168,7 +167,14 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,    BasicBlock *ExitBlock = L->getUniqueExitBlock();    if (ExitBlock && isLoopNeverExecuted(L)) { -    deleteDeadLoop(L, DT, SE, LI, true /* LoopIsNeverExecuted */, Updater); +    // Set incoming value to undef for phi nodes in the exit block. +    BasicBlock::iterator BI = ExitBlock->begin(); +    while (PHINode *P = dyn_cast<PHINode>(BI)) { +      for (unsigned i = 0; i < P->getNumIncomingValues(); i++) +        P->setIncomingValue(i, UndefValue::get(P->getType())); +      BI++; +    } +    deleteDeadLoop(L, DT, SE, LI, Updater);      ++NumDeleted;      return true;    } @@ -196,15 +202,14 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,    if (isa<SCEVCouldNotCompute>(S))      return Changed; -  deleteDeadLoop(L, DT, SE, LI, false /* LoopIsNeverExecuted */, Updater); +  deleteDeadLoop(L, DT, SE, LI, Updater);    ++NumDeleted;    return true;  }  static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, -                           LoopInfo &LI, bool LoopIsNeverExecuted, -                           LPMUpdater *Updater) { +                           LoopInfo &LI, LPMUpdater *Updater) {    assert(L->isLCSSAForm(DT) && "Expected LCSSA!");    auto *Preheader = L->getLoopPreheader();    assert(Preheader && "Preheader should exist!"); @@ -227,6 +232,8 @@ static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE,    auto *ExitBlock = L->getUniqueExitBlock();    assert(ExitBlock && "Should have a unique exit block!"); +  assert(L->hasDedicatedExits() && "Loop should have dedicated exits!"); +    // Connect the preheader directly to the exit block.    // Even when the loop is never executed, we cannot remove the edge from the    // source block to the exit block. Consider the case where the unexecuted loop @@ -236,20 +243,28 @@ static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE,    // non-loop, it will be deleted in a future iteration of loop deletion pass.    Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), ExitBlock); -  SmallVector<BasicBlock *, 4> ExitingBlocks; -  L->getExitingBlocks(ExitingBlocks);    // Rewrite phis in the exit block to get their inputs from the Preheader    // instead of the exiting block. 
-  BasicBlock *ExitingBlock = ExitingBlocks[0];    BasicBlock::iterator BI = ExitBlock->begin();    while (PHINode *P = dyn_cast<PHINode>(BI)) { -    int j = P->getBasicBlockIndex(ExitingBlock); -    assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); -    if (LoopIsNeverExecuted) -      P->setIncomingValue(j, UndefValue::get(P->getType())); -    P->setIncomingBlock(j, Preheader); -    for (unsigned i = 1; i < ExitingBlocks.size(); ++i) -      P->removeIncomingValue(ExitingBlocks[i]); +    // Set the zero'th element of Phi to be from the preheader and remove all +    // other incoming values. Given the loop has dedicated exits, all other +    // incoming values must be from the exiting blocks. +    int PredIndex = 0; +    P->setIncomingBlock(PredIndex, Preheader); +    // Removes all incoming values from all other exiting blocks (including +    // duplicate values from an exiting block). +    // Nuke all entries except the zero'th entry which is the preheader entry. +    // NOTE! We need to remove Incoming Values in the reverse order as done +    // below, to keep the indices valid for deletion (removeIncomingValues +    // updates getNumIncomingValues and shifts all values down into the operand +    // being deleted). +    for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i) +      P->removeIncomingValue(e-i, false); + +    assert((P->getNumIncomingValues() == 1 && +            P->getIncomingBlock(PredIndex) == Preheader) && +           "Should have exactly one value and that's from the preheader!");      ++BI;    } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index b027278b24f2..73436f13c94e 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -131,7 +131,7 @@ static cl::opt<bool> EnablePhiElim(  // The flag adds instruction count to solutions cost comparision.  static cl::opt<bool> InsnsCost( -  "lsr-insns-cost", cl::Hidden, cl::init(true), +  "lsr-insns-cost", cl::Hidden, cl::init(false),    cl::desc("Add instruction count to a LSR cost model"));  // Flag to choose how to narrow complex lsr solution diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index cbbd55512c9f..7a7624f77542 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -1244,27 +1244,24 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {      // only do this for simple stores, we should expand to cover memcpys, etc.      const auto *LastStore = createStoreExpression(SI, StoreRHS);      const auto *LastCC = ExpressionToClass.lookup(LastStore); -    // Basically, check if the congruence class the store is in is defined by a -    // store that isn't us, and has the same value.  MemorySSA takes care of -    // ensuring the store has the same memory state as us already. -    // The RepStoredValue gets nulled if all the stores disappear in a class, so -    // we don't need to check if the class contains a store besides us. -    if (LastCC && -        LastCC->getStoredValue() == lookupOperandLeader(SI->getValueOperand())) +    // We really want to check whether the expression we matched was a store. No +    // easy way to do that. 
However, we can check that the class we found has a +    // store, which, assuming the value numbering state is not corrupt, is +    // sufficient, because we must also be equivalent to that store's expression +    // for it to be in the same class as the load. +    if (LastCC && LastCC->getStoredValue() == LastStore->getStoredValue())        return LastStore; -    deleteExpression(LastStore);      // Also check if our value operand is defined by a load of the same memory      // location, and the memory state is the same as it was then (otherwise, it      // could have been overwritten later. See test32 in      // transforms/DeadStoreElimination/simple.ll). -    if (auto *LI = -            dyn_cast<LoadInst>(lookupOperandLeader(SI->getValueOperand()))) { +    if (auto *LI = dyn_cast<LoadInst>(LastStore->getStoredValue()))        if ((lookupOperandLeader(LI->getPointerOperand()) == -           lookupOperandLeader(SI->getPointerOperand())) && +           LastStore->getOperand(0)) &&            (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) ==             StoreRHS)) -        return createStoreExpression(SI, StoreRHS); -    } +        return LastStore; +    deleteExpression(LastStore);    }    // If the store is not equivalent to anything, value number it as a store that @@ -2332,9 +2329,7 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {  // see if we know some constant value for it already.  Value *NewGVN::findConditionEquivalence(Value *Cond) const {    auto Result = lookupOperandLeader(Cond); -  if (isa<Constant>(Result)) -    return Result; -  return nullptr; +  return isa<Constant>(Result) ? Result : nullptr;  }  // Process the outgoing edges of a block for reachability. @@ -3014,14 +3009,27 @@ void NewGVN::verifyIterationSettled(Function &F) {  // a no-longer valid StoreExpression.  void NewGVN::verifyStoreExpressions() const {  #ifndef NDEBUG -  DenseSet<std::pair<const Value *, const Value *>> StoreExpressionSet; +  // This is the only use of this, and it's not worth defining a complicated +  // densemapinfo hash/equality function for it. +  std::set< +      std::pair<const Value *, +                std::tuple<const Value *, const CongruenceClass *, Value *>>> +      StoreExpressionSet;    for (const auto &KV : ExpressionToClass) {      if (auto *SE = dyn_cast<StoreExpression>(KV.first)) {        // Make sure a version that will conflict with loads is not already there -      auto Res = -          StoreExpressionSet.insert({SE->getOperand(0), SE->getMemoryLeader()}); -      assert(Res.second && -             "Stored expression conflict exists in expression table"); +      auto Res = StoreExpressionSet.insert( +          {SE->getOperand(0), std::make_tuple(SE->getMemoryLeader(), KV.second, +                                              SE->getStoredValue())}); +      bool Okay = Res.second; +      // It's okay to have the same expression already in there if it is +      // identical in nature. +      // This can happen when the leader of the stored value changes over time. 
+      if (!Okay) +        Okay = (std::get<1>(Res.first->second) == KV.second) && +               (lookupOperandLeader(std::get<2>(Res.first->second)) == +                lookupOperandLeader(SE->getStoredValue())); +      assert(Okay && "Stored expression conflict exists in expression table");        auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst());        assert(ValueExpr && ValueExpr->equals(*SE) &&               "StoreExpression in ExpressionToClass is not latest " diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index a20890b22603..6da551bd7efd 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -35,6 +35,7 @@  #include "llvm/IR/IRBuilder.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h"  #include "llvm/IR/ValueHandle.h"  #include "llvm/Pass.h"  #include "llvm/Support/Debug.h" @@ -106,11 +107,12 @@ XorOpnd::XorOpnd(Value *V) {              I->getOpcode() == Instruction::And)) {      Value *V0 = I->getOperand(0);      Value *V1 = I->getOperand(1); -    if (isa<ConstantInt>(V0)) +    const APInt *C; +    if (match(V0, PatternMatch::m_APInt(C)))        std::swap(V0, V1); -    if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) { -      ConstPart = C->getValue(); +    if (match(V1, PatternMatch::m_APInt(C))) { +      ConstPart = *C;        SymbolicPart = V0;        isOr = (I->getOpcode() == Instruction::Or);        return; @@ -119,7 +121,7 @@ XorOpnd::XorOpnd(Value *V) {    // view the operand as "V | 0"    SymbolicPart = V; -  ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth()); +  ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits());    isOr = true;  } @@ -955,8 +957,8 @@ static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {  /// Scan backwards and forwards among values with the same rank as element i  /// to see if X exists.  If X does not exist, return i.  This is useful when  /// scanning for 'x' when we see '-x' because they both get the same rank. -static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, -                                  Value *X) { +static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops, +                                  unsigned i, Value *X) {    unsigned XRank = Ops[i].Rank;    unsigned e = Ops.size();    for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) { @@ -1134,20 +1136,19 @@ static Value *OptimizeAndOrXor(unsigned Opcode,  /// instruction. There are two special cases: 1) if the constant operand is 0,  /// it will return NULL. 2) if the constant is ~0, the symbolic operand will  /// be returned. 
-static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,  +static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,                               const APInt &ConstOpnd) { -  if (ConstOpnd != 0) { -    if (!ConstOpnd.isAllOnesValue()) { -      LLVMContext &Ctx = Opnd->getType()->getContext(); -      Instruction *I; -      I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd), -                                    "and.ra", InsertBefore); -      I->setDebugLoc(InsertBefore->getDebugLoc()); -      return I; -    } +  if (ConstOpnd.isNullValue()) +    return nullptr; + +  if (ConstOpnd.isAllOnesValue())      return Opnd; -  } -  return nullptr; + +  Instruction *I = BinaryOperator::CreateAnd( +      Opnd, ConstantInt::get(Opnd->getType(), ConstOpnd), "and.ra", +      InsertBefore); +  I->setDebugLoc(InsertBefore->getDebugLoc()); +  return I;  }  // Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd" @@ -1163,24 +1164,24 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,    //                       = ((x | c1) ^ c1) ^ (c1 ^ c2)    //                       = (x & ~c1) ^ (c1 ^ c2)    // It is useful only when c1 == c2. -  if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) { -    if (!Opnd1->getValue()->hasOneUse()) -      return false; +  if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue()) +    return false; -    const APInt &C1 = Opnd1->getConstPart(); -    if (C1 != ConstOpnd) -      return false; +  if (!Opnd1->getValue()->hasOneUse()) +    return false; -    Value *X = Opnd1->getSymbolicPart(); -    Res = createAndInstr(I, X, ~C1); -    // ConstOpnd was C2, now C1 ^ C2. -    ConstOpnd ^= C1; +  const APInt &C1 = Opnd1->getConstPart(); +  if (C1 != ConstOpnd) +    return false; -    if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue())) -      RedoInsts.insert(T); -    return true; -  } -  return false; +  Value *X = Opnd1->getSymbolicPart(); +  Res = createAndInstr(I, X, ~C1); +  // ConstOpnd was C2, now C1 ^ C2. +  ConstOpnd ^= C1; + +  if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue())) +    RedoInsts.insert(T); +  return true;  } @@ -1221,8 +1222,8 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,      APInt C3((~C1) ^ C2);      // Do not increase code size! -    if (C3 != 0 && !C3.isAllOnesValue()) { -      int NewInstNum = ConstOpnd != 0 ? 1 : 2; +    if (!C3.isNullValue() && !C3.isAllOnesValue()) { +      int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;        if (NewInstNum > DeadInstNum)          return false;      } @@ -1238,8 +1239,8 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,      APInt C3 = C1 ^ C2;      // Do not increase code size -    if (C3 != 0 && !C3.isAllOnesValue()) { -      int NewInstNum = ConstOpnd != 0 ? 1 : 2; +    if (!C3.isNullValue() && !C3.isAllOnesValue()) { +      int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;        if (NewInstNum > DeadInstNum)          return false;      } @@ -1279,17 +1280,20 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,    SmallVector<XorOpnd, 8> Opnds;    SmallVector<XorOpnd*, 8> OpndPtrs;    Type *Ty = Ops[0].Op->getType(); -  APInt ConstOpnd(Ty->getIntegerBitWidth(), 0); +  APInt ConstOpnd(Ty->getScalarSizeInBits(), 0);    // Step 1: Convert ValueEntry to XorOpnd    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {      Value *V = Ops[i].Op; -    if (!isa<ConstantInt>(V)) { +    const APInt *C; +    // TODO: Support non-splat vectors. 
+    if (match(V, PatternMatch::m_APInt(C))) { +      ConstOpnd ^= *C; +    } else {        XorOpnd O(V);        O.setSymbolicRank(getRank(O.getSymbolicPart()));        Opnds.push_back(O); -    } else -      ConstOpnd ^= cast<ConstantInt>(V)->getValue(); +    }    }    // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds". @@ -1327,7 +1331,8 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,      Value *CV;      // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd" -    if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) { +    if (!ConstOpnd.isNullValue() && +        CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {        Changed = true;        if (CV)          *CurrOpnd = XorOpnd(CV); @@ -1369,17 +1374,17 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,        ValueEntry VE(getRank(O.getValue()), O.getValue());        Ops.push_back(VE);      } -    if (ConstOpnd != 0) { -      Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd); +    if (!ConstOpnd.isNullValue()) { +      Value *C = ConstantInt::get(Ty, ConstOpnd);        ValueEntry VE(getRank(C), C);        Ops.push_back(VE);      } -    int Sz = Ops.size(); +    unsigned Sz = Ops.size();      if (Sz == 1)        return Ops.back().Op; -    else if (Sz == 0) { -      assert(ConstOpnd == 0); -      return ConstantInt::get(Ty->getContext(), ConstOpnd); +    if (Sz == 0) { +      assert(ConstOpnd.isNullValue()); +      return ConstantInt::get(Ty, ConstOpnd);      }    } @@ -1627,8 +1632,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,  ///   ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]  ///  /// \returns Whether any factors have a power greater than one. -bool ReassociatePass::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, -                                             SmallVectorImpl<Factor> &Factors) { +static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, +                                   SmallVectorImpl<Factor> &Factors) {    // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.    // Compute the sum of powers of simplifiable factors.    unsigned FactorPowerSum = 0; @@ -1999,11 +2004,6 @@ void ReassociatePass::OptimizeInst(Instruction *I) {    if (I->isCommutative())      canonicalizeOperands(I); -  // TODO: We should optimize vector Xor instructions, but they are -  // currently unsupported. -  if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor) -    return; -    // Don't optimize floating point instructions that don't have unsafe algebra.    if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())      return; diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index c6929c33b3e9..7a6fa1711411 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -536,9 +536,10 @@ private:    void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }    void visitFenceInst     (FenceInst &I) { /*returns void*/ }    void visitInstruction(Instruction &I) { -    // If a new instruction is added to LLVM that we don't handle. +    // All the instructions we don't do any special handling for just +    // go to overdefined.      
DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n'); -    markOverdefined(&I);   // Just in case +    markOverdefined(&I);    }  }; @@ -1814,15 +1815,11 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,      if (F.isDeclaration())        continue; -    if (Solver.isBlockExecutable(&F.front())) { +    if (Solver.isBlockExecutable(&F.front()))        for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; -           ++AI) { -        if (AI->use_empty()) -          continue; -        if (tryToReplaceWithConstant(Solver, &*AI)) +           ++AI) +        if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI))            ++IPNumArgsElimed; -      } -    }      for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {        if (!Solver.isBlockExecutable(&*BB)) { diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 1527f15f18a3..80fbbeb6829b 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1252,7 +1252,7 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) {      if (!LI || !LI->isSimple())        return false; -    // Both operands to the select need to be dereferencable, either +    // Both operands to the select need to be dereferenceable, either      // absolutely (e.g. allocas) or at this point because we can see other      // accesses to it.      if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), DL, LI)) @@ -1637,8 +1637,17 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {        return cast<PointerType>(NewTy)->getPointerAddressSpace() ==          cast<PointerType>(OldTy)->getPointerAddressSpace();      } -    if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) -      return true; + +    // We can convert integers to integral pointers, but not to non-integral +    // pointers. +    if (OldTy->isIntegerTy()) +      return !DL.isNonIntegralPointerType(NewTy); + +    // We can convert integral pointers to integers, but non-integral pointers +    // need to remain pointers. +    if (!DL.isNonIntegralPointerType(OldTy)) +      return NewTy->isIntegerTy(); +      return false;    } diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 850a01114eeb..ce6f93eb0c15 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -91,7 +91,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {    initializeSeparateConstOffsetFromGEPPass(Registry);    initializeSpeculativeExecutionLegacyPassPass(Registry);    initializeStraightLineStrengthReducePass(Registry); -  initializeLoadCombinePass(Registry);    initializePlaceBackedgeSafepointsImplPass(Registry);    initializePlaceSafepointsPass(Registry);    initializeFloat2IntLegacyPassPass(Registry); diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 3e5993618c4c..9397b87cdf56 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -321,7 +321,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) {  /// instruction from after the call to before the call, assuming that all  /// instructions between the call and this instruction are movable.  
/// -static bool canMoveAboveCall(Instruction *I, CallInst *CI) { +static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {    // FIXME: We can move load/store/call/free instructions above the call if the    // call does not mod/ref the memory location being processed.    if (I->mayHaveSideEffects())  // This also handles volatile loads. @@ -332,10 +332,10 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI) {      if (CI->mayHaveSideEffects()) {        // Non-volatile loads may be moved above a call with side effects if it        // does not write to memory and the load provably won't trap. -      // FIXME: Writes to memory only matter if they may alias the pointer +      // Writes to memory only matter if they may alias the pointer        // being loaded from.        const DataLayout &DL = L->getModule()->getDataLayout(); -      if (CI->mayWriteToMemory() || +      if ((AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod) ||            !isSafeToLoadUnconditionally(L->getPointerOperand(),                                         L->getAlignment(), DL, L))          return false; @@ -492,10 +492,11 @@ static CallInst *findTRECandidate(Instruction *TI,    return CI;  } -static bool -eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, -                           bool &TailCallsAreMarkedTail, -                           SmallVectorImpl<PHINode *> &ArgumentPHIs) { +static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, +                                       BasicBlock *&OldEntry, +                                       bool &TailCallsAreMarkedTail, +                                       SmallVectorImpl<PHINode *> &ArgumentPHIs, +                                       AliasAnalysis *AA) {    // If we are introducing accumulator recursion to eliminate operations after    // the call instruction that are both associative and commutative, the initial    // value for the accumulator is placed in this variable.  If this value is set @@ -515,7 +516,8 @@ eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry,    // Check that this is the case now.    BasicBlock::iterator BBI(CI);    for (++BBI; &*BBI != Ret; ++BBI) { -    if (canMoveAboveCall(&*BBI, CI)) continue; +    if (canMoveAboveCall(&*BBI, CI, AA)) +      continue;      // If we can't move the instruction above the call, it might be because it      // is an associative and commutative operation that could be transformed @@ -674,12 +676,17 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret,                                       bool &TailCallsAreMarkedTail,                                       SmallVectorImpl<PHINode *> &ArgumentPHIs,                                       bool CannotTailCallElimCallsMarkedTail, -                                     const TargetTransformInfo *TTI) { +                                     const TargetTransformInfo *TTI, +                                     AliasAnalysis *AA) {    bool Change = false; +  // Make sure this block is a trivial return block. +  assert(BB->getFirstNonPHIOrDbg() == Ret && +         "Trying to fold non-trivial return block"); +    // If the return block contains nothing but the return and PHI's,    // there might be an opportunity to duplicate the return in its -  // predecessors and perform TRC there. Look for predecessors that end +  // predecessors and perform TRE there. Look for predecessors that end    // in unconditional branch and recursive call(s).    
SmallVector<BranchInst*, 8> UncondBranchPreds;    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { @@ -706,7 +713,7 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret,          BB->eraseFromParent();        eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail, -                                 ArgumentPHIs); +                                 ArgumentPHIs, AA);        ++NumRetDuped;        Change = true;      } @@ -719,16 +726,18 @@ static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,                                    bool &TailCallsAreMarkedTail,                                    SmallVectorImpl<PHINode *> &ArgumentPHIs,                                    bool CannotTailCallElimCallsMarkedTail, -                                  const TargetTransformInfo *TTI) { +                                  const TargetTransformInfo *TTI, +                                  AliasAnalysis *AA) {    CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);    if (!CI)      return false;    return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail, -                                    ArgumentPHIs); +                                    ArgumentPHIs, AA);  } -static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI) { +static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI, +                                   AliasAnalysis *AA) {    if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")      return false; @@ -763,11 +772,11 @@ static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI)      if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {        bool Change =            processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, -                                ArgumentPHIs, !CanTRETailMarkedCall, TTI); +                                ArgumentPHIs, !CanTRETailMarkedCall, TTI, AA);        if (!Change && BB->getFirstNonPHIOrDbg() == Ret) -        Change = -            foldReturnAndProcessPred(BB, Ret, OldEntry, TailCallsAreMarkedTail, -                                     ArgumentPHIs, !CanTRETailMarkedCall, TTI); +        Change = foldReturnAndProcessPred(BB, Ret, OldEntry, +                                          TailCallsAreMarkedTail, ArgumentPHIs, +                                          !CanTRETailMarkedCall, TTI, AA);        MadeChange |= Change;      }    } @@ -797,6 +806,7 @@ struct TailCallElim : public FunctionPass {    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.addRequired<TargetTransformInfoWrapperPass>(); +    AU.addRequired<AAResultsWrapperPass>();      AU.addPreserved<GlobalsAAWrapperPass>();    } @@ -805,7 +815,8 @@ struct TailCallElim : public FunctionPass {        return false;      return eliminateTailRecursion( -        F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F)); +        F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F), +        &getAnalysis<AAResultsWrapperPass>().getAAResults());    }  };  } @@ -826,8 +837,9 @@ PreservedAnalyses TailCallElimPass::run(Function &F,                                          FunctionAnalysisManager &AM) {    TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); +  AliasAnalysis &AA = AM.getResult<AAManager>(F); -  bool Changed = eliminateTailRecursion(F, &TTI); +  bool Changed = eliminateTailRecursion(F, &TTI, &AA);    if (!Changed)      return PreservedAnalyses::all(); 
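The effect of threading AliasAnalysis into the tail-recursion pass above is easiest to see in the load case of canMoveAboveCall. A condensed sketch of that check, using only calls that already appear in the hunk and assuming the file's existing includes, might read:

    // A non-volatile load between the recursive call and the return may be
    // hoisted above the call when the call cannot modify the loaded location
    // and the load cannot trap if executed early.
    static bool loadCanMoveAboveCall(LoadInst *L, CallInst *CI,
                                     AliasAnalysis *AA, const DataLayout &DL) {
      if (L->isVolatile())
        return false;
      // Before this change the pass gave up whenever CI->mayWriteToMemory();
      // now only a write that may alias the loaded location blocks the motion.
      if (AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod)
        return false;
      return isSafeToLoadUnconditionally(L->getPointerOperand(),
                                         L->getAlignment(), DL, L);
    }

Whether a given recursive call is actually proven harmless depends on what the alias-analysis pipeline can establish for it; the sketch only restates the condition from the hunk above and is not the pass's code.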
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index ebde1f9a17dd..b60dfb4f3541 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -116,6 +116,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {    case LibFunc_wcslen:      Changed |= setOnlyReadsMemory(F);      Changed |= setDoesNotThrow(F); +    Changed |= setOnlyAccessesArgMemory(F);      Changed |= setDoesNotCapture(F, 0);      return Changed;    case LibFunc_strchr: diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 2af671636cbd..5127eba3f9ae 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -26,6 +26,7 @@  #include "llvm/Analysis/MemoryBuiltins.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/CFG.h" +#include "llvm/IR/ConstantRange.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DIBuilder.h"  #include "llvm/IR/DataLayout.h" @@ -1081,7 +1082,7 @@ static bool LdStHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr,  }  /// See if there is a dbg.value intrinsic for DIVar for the PHI node. -static bool PhiHasDebugValue(DILocalVariable *DIVar,  +static bool PhiHasDebugValue(DILocalVariable *DIVar,                               DIExpression *DIExpr,                               PHINode *APN) {    // Since we can't guarantee that the original dbg.declare instrinsic @@ -1159,7 +1160,7 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,    DbgValue->insertAfter(LI);  } -/// Inserts a llvm.dbg.value intrinsic after a phi  +/// Inserts a llvm.dbg.value intrinsic after a phi  /// that has an associated llvm.dbg.decl intrinsic.  void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,                                             PHINode *APN, DIBuilder &Builder) { @@ -1742,12 +1743,12 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,          // Preserve !invariant.group in K.          break;        case LLVMContext::MD_align: -        K->setMetadata(Kind,  +        K->setMetadata(Kind,            MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));          break;        case LLVMContext::MD_dereferenceable:        case LLVMContext::MD_dereferenceable_or_null: -        K->setMetadata(Kind,  +        K->setMetadata(Kind,            MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));          break;      } @@ -1847,6 +1848,49 @@ bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {    return false;  } +void llvm::copyNonnullMetadata(const LoadInst &OldLI, MDNode *N, +                               LoadInst &NewLI) { +  auto *NewTy = NewLI.getType(); + +  // This only directly applies if the new type is also a pointer. +  if (NewTy->isPointerTy()) { +    NewLI.setMetadata(LLVMContext::MD_nonnull, N); +    return; +  } + +  // The only other translation we can do is to integral loads with !range +  // metadata. 
+  if (!NewTy->isIntegerTy()) +    return; + +  MDBuilder MDB(NewLI.getContext()); +  const Value *Ptr = OldLI.getPointerOperand(); +  auto *ITy = cast<IntegerType>(NewTy); +  auto *NullInt = ConstantExpr::getPtrToInt( +      ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy); +  auto *NonNullInt = ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1)); +  NewLI.setMetadata(LLVMContext::MD_range, +                    MDB.createRange(NonNullInt, NullInt)); +} + +void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI, +                             MDNode *N, LoadInst &NewLI) { +  auto *NewTy = NewLI.getType(); + +  // Give up unless it is converted to a pointer where there is a single very +  // valuable mapping we can do reliably. +  // FIXME: It would be nice to propagate this in more ways, but the type +  // conversions make it hard. +  if (!NewTy->isPointerTy()) +    return; + +  unsigned BitWidth = DL.getTypeSizeInBits(NewTy); +  if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) { +    MDNode *NN = MDNode::get(OldLI.getContext(), None); +    NewLI.setMetadata(LLVMContext::MD_nonnull, NN); +  } +} +  namespace {  /// A potential constituent of a bitreverse or bswap expression. See  /// collectBitParts for a fuller explanation. @@ -1968,7 +2012,7 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,        unsigned NumMaskedBits = AndMask.countPopulation();        if (!MatchBitReversals && NumMaskedBits % 8 != 0)          return Result; -       +        auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,                                    MatchBitReversals, BPS);        if (!Res) diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index f3db278ef1e4..e21e34df8ded 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -72,7 +72,6 @@ using namespace llvm;  #define DEBUG_TYPE "loop-simplify" -STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted");  STATISTIC(NumNested  , "Number of nested loops split out");  // If the block isn't already, move the new block to right after some 'outside @@ -152,37 +151,6 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT,    return PreheaderBB;  } -/// \brief Ensure that the loop preheader dominates all exit blocks. -/// -/// This method is used to split exit blocks that have predecessors outside of -/// the loop. -static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, -                                        DominatorTree *DT, LoopInfo *LI, -                                        bool PreserveLCSSA) { -  SmallVector<BasicBlock*, 8> LoopBlocks; -  for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { -    BasicBlock *P = *I; -    if (L->contains(P)) { -      // Don't do this if the loop is exited via an indirect branch. 
-      if (isa<IndirectBrInst>(P->getTerminator())) return nullptr;
-
-      LoopBlocks.push_back(P);
-    }
-  }
-
-  assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?");
-  BasicBlock *NewExitBB = nullptr;
-
-  NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", DT, LI,
-                                     PreserveLCSSA);
-  if (!NewExitBB)
-    return nullptr;
-
-  DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
-               << NewExitBB->getName() << "\n");
-  return NewExitBB;
-}
-
 /// Add the specified block, and all of its predecessors, to the specified set,
 /// if it's not already in there.  Stop predecessor traversal when we reach
 /// StopBlock.
@@ -346,16 +314,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
 
   // Split edges to exit blocks from the inner loop, if they emerged in the
   // process of separating the outer one.
-  SmallVector<BasicBlock *, 8> ExitBlocks;
-  L->getExitBlocks(ExitBlocks);
-  SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(),
-                                               ExitBlocks.end());
-  for (BasicBlock *ExitBlock : ExitBlockSet) {
-    if (any_of(predecessors(ExitBlock),
-               [L](BasicBlock *BB) { return !L->contains(BB); })) {
-      rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA);
-    }
-  }
+  formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA);
 
   if (PreserveLCSSA) {
     // Fix LCSSA form for L. Some values, which previously were only used inside
@@ -563,29 +522,16 @@ ReprocessLoop:
   BasicBlock *Preheader = L->getLoopPreheader();
   if (!Preheader) {
     Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
-    if (Preheader) {
-      ++NumInserted;
+    if (Preheader)
       Changed = true;
-    }
   }
 
   // Next, check to make sure that all exit nodes of the loop only have
   // predecessors that are inside of the loop.  This check guarantees that the
   // loop preheader/header will dominate the exit blocks.  If the exit block has
   // predecessors from outside of the loop, split the edge now.
-  SmallVector<BasicBlock*, 8> ExitBlocks;
-  L->getExitBlocks(ExitBlocks);
-
-  SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(),
-                                               ExitBlocks.end());
-  for (BasicBlock *ExitBlock : ExitBlockSet) {
-    if (any_of(predecessors(ExitBlock),
-               [L](BasicBlock *BB) { return !L->contains(BB); })) {
-      rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA);
-      ++NumInserted;
-      Changed = true;
-    }
-  }
+  if (formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA))
+    Changed = true;
 
   // If the header has more than two predecessors at this point (from the
   // preheader and from multiple backedges), we must adjust the loop.
@@ -614,10 +560,8 @@ ReprocessLoop:
     // insert a new block that all backedges target, then make it jump to the
     // loop header.
     LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI);
-    if (LoopLatch) {
-      ++NumInserted;
+    if (LoopLatch)
       Changed = true;
-    }
   }
 
   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
@@ -645,7 +589,22 @@ ReprocessLoop:
   // loop-invariant instructions out of the way to open up more
   // opportunities, and the disadvantage of having the responsibility
   // to preserve dominator information.
-  if (ExitBlockSet.size() == 1) {
+  auto HasUniqueExitBlock = [&]() {
+    BasicBlock *UniqueExit = nullptr;
+    for (auto *ExitingBB : ExitingBlocks)
+      for (auto *SuccBB : successors(ExitingBB)) {
+        if (L->contains(SuccBB))
+          continue;
+
+        if (!UniqueExit)
+          UniqueExit = SuccBB;
+        else if (UniqueExit != SuccBB)
+          return false;
+      }
+
+    return true;
+  };
+  if (HasUniqueExitBlock()) {
     for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
       BasicBlock *ExitingBlock = ExitingBlocks[i];
       if (!ExitingBlock->getSinglePredecessor()) continue;
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index a920cd86a26a..5f85e17927fa 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -472,10 +472,22 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
   // exit block only.
   if (!L->isLoopSimplifyForm())
     return false;
-  BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop
-  if (!Exit)
-    return false;
+  // Guaranteed by LoopSimplifyForm.
+  BasicBlock *Latch = L->getLoopLatch();
+
+  BasicBlock *LatchExit = L->getUniqueExitBlock(); // successor out of loop
+  if (!LatchExit)
+    return false;
+  // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
+  // targets of the Latch be the single exit block out of the loop. This needs
+  // to be guaranteed by the callers of UnrollRuntimeLoopRemainder.
+  BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+  assert((LatchBR->getSuccessor(0) == LatchExit ||
+          LatchBR->getSuccessor(1) == LatchExit) &&
+         "one of the loop latch successors should be "
+         "the exit block!");
+  (void)LatchBR;
 
   // Use Scalar Evolution to compute the trip count. This allows more loops to
   // be unrolled than relying on induction var simplification.
   if (!SE)
@@ -510,25 +522,13 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
   if (Log2_32(Count) > BEWidth)
     return false;
 
-  BasicBlock *Latch = L->getLoopLatch();
-
-  // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
-  // targets of the Latch be the single exit block out of the loop. This needs
-  // to be guaranteed by the callers of UnrollRuntimeLoopRemainder.
-  BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
-  assert(
-      (LatchBR->getSuccessor(0) == Exit || LatchBR->getSuccessor(1) == Exit) &&
-      "one of the loop latch successors should be "
-      "the exit block!");
-  // Avoid warning of unused `LatchBR` variable in release builds.
-  (void)LatchBR;
 
   // Loop structure is the following:
   //
   // PreHeader
   //   Header
   //   ...
   //   Latch
-  // Exit
+  // LatchExit
   BasicBlock *NewPreHeader;
   BasicBlock *NewExit = nullptr;
@@ -541,9 +541,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
     // Split PreHeader to insert a branch around loop for unrolling.
     NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
     NewPreHeader->setName(PreHeader->getName() + ".new");
-    // Split Exit to create phi nodes from branch above.
-    SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
-    NewExit = SplitBlockPredecessors(Exit, Preds, ".unr-lcssa",
+    // Split LatchExit to create phi nodes from branch above.
+    SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit));
+    NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa",
                                      DT, LI, PreserveLCSSA);
     // Split NewExit to insert epilog remainder loop.
     EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI);
@@ -570,7 +570,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
   //   Latch             Header
   // *NewExit            ...
   // *EpilogPreHeader    Latch
-  // Exit              Exit
+  // LatchExit              LatchExit
 
   // Calculate conditions for branch around loop for unrolling
   // in epilog case and around prolog remainder loop in prolog case.
@@ -648,7 +648,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
   // Clone all the basic blocks in the loop. If Count is 2, we don't clone
   // the loop, otherwise we create a cloned loop to execute the extra
   // iterations. This function adds the appropriate CFG connections.
-  BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit;
+  BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
   BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
   CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop,
                   InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
@@ -672,7 +672,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
   //   EpilogHeader      Header
   //   ...               ...
   //   EpilogLatch       Latch
-  // Exit              Exit
+  // LatchExit              LatchExit
 
   // Rewrite the cloned instruction operands to use the values created when the
   // clone is created.
@@ -686,7 +686,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
   if (UseEpilogRemainder) {
     // Connect the epilog code to the original loop and update the
     // PHI functions.
-    ConnectEpilog(L, ModVal, NewExit, Exit, PreHeader,
+    ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader,
                   EpilogPreHeader, NewPreHeader, VMap, DT, LI,
                   PreserveLCSSA);
 
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index 412f6129407e..0ed33945ef40 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
@@ -29,6 +30,7 @@
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -87,8 +89,7 @@ RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT,
 
   // Matches either I & 2^x-1 or 2^x-1 & I. If we find a match, we update RT
   // with a new integer type of the corresponding bit width.
-  if (match(J, m_CombineOr(m_And(m_Instruction(I), m_APInt(M)),
-                           m_And(m_APInt(M), m_Instruction(I))))) {
+  if (match(J, m_c_And(m_Instruction(I), m_APInt(M)))) {
     int32_t Bits = (*M + 1).exactLogBase2();
     if (Bits > 0) {
       RT = IntegerType::get(Phi->getContext(), Bits);
@@ -923,6 +924,69 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
   return true;
 }
 
+bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
+                                   bool PreserveLCSSA) {
+  bool Changed = false;
+
+  // We re-use a vector for the in-loop predecessors.
+  SmallVector<BasicBlock *, 4> InLoopPredecessors;
+
+  auto RewriteExit = [&](BasicBlock *BB) {
+    assert(InLoopPredecessors.empty() &&
+           "Must start with an empty predecessors list!");
+    auto Cleanup = make_scope_exit([&] { InLoopPredecessors.clear(); });
+
+    // See if there are any non-loop predecessors of this exit block and
+    // keep track of the in-loop predecessors.
+    bool IsDedicatedExit = true;
+    for (auto *PredBB : predecessors(BB))
+      if (L->contains(PredBB)) {
+        if (isa<IndirectBrInst>(PredBB->getTerminator()))
+          // We cannot rewrite exiting edges from an indirectbr.
+          return false;
+
+        InLoopPredecessors.push_back(PredBB);
+      } else {
+        IsDedicatedExit = false;
+      }
+
+    assert(!InLoopPredecessors.empty() && "Must have *some* loop predecessor!");
+
+    // Nothing to do if this is already a dedicated exit.
+    if (IsDedicatedExit)
+      return false;
+
+    auto *NewExitBB = SplitBlockPredecessors(
+        BB, InLoopPredecessors, ".loopexit", DT, LI, PreserveLCSSA);
+
+    if (!NewExitBB)
+      DEBUG(dbgs() << "WARNING: Can't create a dedicated exit block for loop: "
+                   << *L << "\n");
+    else
+      DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
                   << NewExitBB->getName() << "\n");
+    return true;
+  };
+
+  // Walk the exit blocks directly rather than building up a data structure for
+  // them, but only visit each one once.
+  SmallPtrSet<BasicBlock *, 4> Visited;
+  for (auto *BB : L->blocks())
+    for (auto *SuccBB : successors(BB)) {
+      // We're looking for exit blocks so skip in-loop successors.
+      if (L->contains(SuccBB))
+        continue;
+
+      // Visit each exit block exactly once.
+      if (!Visited.insert(SuccBB).second)
+        continue;
+
+      Changed |= RewriteExit(SuccBB);
+    }
+
+  return Changed;
+}
+
 /// \brief Returns the instructions that use values defined in the loop.
 SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {
   SmallVector<Instruction *, 8> UsedOutside;
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1abdb2484850..eac2867233bc 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5702,14 +5702,14 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
 
 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
   // We should not collect Uniforms more than once per VF. Right now,
-  // this function is called from collectUniformsAndScalars(), which 
+  // this function is called from collectUniformsAndScalars(), which
   // already does this check. Collecting Uniforms for VF=1 does not make any
   // sense.
 
   assert(VF >= 2 && !Uniforms.count(VF) &&
          "This function should not be visited twice for the same VF");
 
-  // Visit the list of Uniforms. If we'll not find any uniform value, we'll 
+  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
   // not analyze again.  Uniforms.count(VF) will return 1.
   Uniforms[VF].clear();
 
@@ -5988,10 +5988,10 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
         continue;
 
       Value *Ptr = getPointerOperand(&I);
-      // We don't check wrapping here because we don't know yet if Ptr will be 
-      // part of a full group or a group with gaps. Checking wrapping for all 
+      // We don't check wrapping here because we don't know yet if Ptr will be
+      // part of a full group or a group with gaps. Checking wrapping for all
       // pointers (even those that end up in groups with no gaps) will be overly
-      // conservative. For full groups, wrapping should be ok since if we would 
+      // conservative. For full groups, wrapping should be ok since if we would
       // wrap around the address space we would do a memory access at nullptr
       // even without the transformation. The wrapping checks are therefore
       // deferred until after we've formed the interleaved groups.
@@ -6244,7 +6244,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
     Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
     if (LastMember) {
       Value *LastMemberPtr = getPointerOperand(LastMember);
-      if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false, 
+      if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
                         /*ShouldCheckWrap=*/true)) {
         DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
                         "last group member potentially pointer-wrapping.\n");
@@ -6252,9 +6252,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
       }
     } else {
       // Case 3: A non-reversed interleaved load group with gaps: We need
-      // to execute at least one scalar epilogue iteration. This will ensure 
+      // to execute at least one scalar epilogue iteration. This will ensure
      // we don't speculatively access memory out-of-bounds. We only need
-      // to look for a member at index factor - 1, since every group must have 
+      // to look for a member at index factor - 1, since every group must have
       // a member at index zero.
       if (Group->isReverse()) {
         releaseGroup(Group);
@@ -7789,8 +7789,18 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Check the loop for a trip count threshold:
   // do not vectorize loops with a tiny trip count.
-  const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L);
-  if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) {
+  unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
+  bool HasExpectedTC = (ExpectedTC > 0);
+
+  if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
+    auto EstimatedTC = getLoopEstimatedTripCount(L);
+    if (EstimatedTC) {
+      ExpectedTC = *EstimatedTC;
+      HasExpectedTC = true;
+    }
+  }
+
+  if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
"                   << "This loop is not worth vectorizing.");      if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) @@ -7822,18 +7832,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {    bool OptForSize =        Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); -  // Compute the weighted frequency of this loop being executed and see if it -  // is less than 20% of the function entry baseline frequency. Note that we -  // always have a canonical loop here because we think we *can* vectorize. -  // FIXME: This is hidden behind a flag due to pervasive problems with -  // exactly what block frequency models. -  if (LoopVectorizeWithBlockFrequency) { -    BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); -    if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && -        LoopEntryFreq < ColdEntryFreq) -      OptForSize = true; -  } -    // Check the function attributes to see if implicit floats are allowed.    // FIXME: This check doesn't seem possibly correct -- what if the loop is    // an integer loop and the vector instructions selected are purely integer @@ -8015,11 +8013,6 @@ bool LoopVectorizePass::runImpl(    DB = &DB_;    ORE = &ORE_; -  // Compute some weights outside of the loop over the loops. Compute this -  // using a BranchProbability to re-use its scaling math. -  const BranchProbability ColdProb(1, 5); // 20% -  ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; -    // Don't attempt if    // 1. the target claims to have no vector registers, and    // 2. interleaving won't help ILP. diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index d1349535f298..b267230d3185 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3327,12 +3327,10 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,              if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {                BundleMember->Dependencies++;                ScheduleData *DestBundle = UseSD->FirstInBundle; -              if (!DestBundle->IsScheduled) { +              if (!DestBundle->IsScheduled)                  BundleMember->incrementUnscheduledDeps(1); -              } -              if (!DestBundle->hasValidDependencies()) { +              if (!DestBundle->hasValidDependencies())                  WorkList.push_back(DestBundle); -              }              }            } else {              // I'm not sure if this can ever happen. But we need to be safe.  | 
