Diffstat (limited to 'lib/Transforms')
-rw-r--r--  lib/Transforms/IPO/PartialInlining.cpp                |   9
-rw-r--r--  lib/Transforms/IPO/SampleProfile.cpp                  |   5
-rw-r--r--  lib/Transforms/IPO/WholeProgramDevirt.cpp             |   1
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCalls.cpp       |   1
-rw-r--r--  lib/Transforms/Instrumentation/HWAddressSanitizer.cpp |  25
-rw-r--r--  lib/Transforms/Scalar/CallSiteSplitting.cpp           |  46
-rw-r--r--  lib/Transforms/Scalar/JumpThreading.cpp               |  19
-rw-r--r--  lib/Transforms/Scalar/LoopSink.cpp                    |   2
-rw-r--r--  lib/Transforms/Scalar/LoopUnrollPass.cpp              |   2
-rw-r--r--  lib/Transforms/Scalar/MemCpyOptimizer.cpp             |  56
-rw-r--r--  lib/Transforms/Scalar/SCCP.cpp                        |  29
-rw-r--r--  lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp  |   2
-rw-r--r--  lib/Transforms/Utils/CallPromotionUtils.cpp           | 255
-rw-r--r--  lib/Transforms/Utils/LoopUnrollPeel.cpp               |   2
-rw-r--r--  lib/Transforms/Utils/SimplifyCFG.cpp                  | 203
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp            |   4
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp            | 279

17 files changed, 593 insertions(+), 347 deletions(-)
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 683655f1f68b..a9cfd8ded6fb 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -710,7 +710,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {  // Check if there is PGO data or user annoated branch data:  static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { -  if (F->getEntryCount()) +  if (F->hasProfileData())      return true;    // Now check if any of the entry block has MD_prof data:    for (auto *E : OI->Entries) { @@ -863,6 +863,7 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {      case Instruction::GetElementPtr:        if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())          continue; +      break;      default:        break;      } @@ -1273,7 +1274,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {    // Only try to outline cold regions if we have a profile summary, which    // implies we have profiling information. -  if (PSI->hasProfileSummary() && F->getEntryCount().hasValue() && +  if (PSI->hasProfileSummary() && F->hasProfileData() &&        !DisableMultiRegionPartialInline) {      std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =          computeOutliningColdRegionsInfo(F); @@ -1379,10 +1380,10 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {                              Cloner.ClonedFunc->user_end());    DenseMap<User *, uint64_t> CallSiteToProfCountMap; -  if (Cloner.OrigFunc->getEntryCount()) +  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); +  if (CalleeEntryCount)      computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); -  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();    uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);    bool AnyInline = false; diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index f0e781b9d923..7086c2eb52c4 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -1583,7 +1583,10 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {  }  bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { -  F.setEntryCount(0); +  // Initialize the entry count to -1, which will be treated conservatively +  // by getEntryCount as the same as unknown (None). If we have samples this +  // will be overwritten in emitAnnotations. 
+  F.setEntryCount(-1);    std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;    if (AM) {      auto &FAM = diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp index ec56f0cde25d..5fbb001216a3 100644 --- a/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -1346,6 +1346,7 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {        Constant *Bit = importConstant(Slot, CSByConstantArg.first, "bit", Int8Ty,                                       ResByArg.Bit);        applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit); +      break;      }      default:        break; diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index aa055121e710..a088d447337f 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4394,6 +4394,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,              cast<CallInst>(Caller)->getCallingConv());          cast<CallInst>(NewCaller)->setAttributes(NewPAL);        } +      NewCaller->setDebugLoc(Caller->getDebugLoc());        return NewCaller;      } diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 2a25423e04bd..8e2833d22032 100644 --- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -80,6 +80,11 @@ static cl::opt<bool> ClInstrumentAtomics(      cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,      cl::init(true)); +static cl::opt<bool> ClRecover( +    "hwasan-recover", +    cl::desc("Enable recovery mode (continue-after-error)."), +    cl::Hidden, cl::init(false)); +  namespace {  /// \brief An instrumentation pass implementing detection of addressability bugs @@ -89,7 +94,8 @@ public:    // Pass identification, replacement for typeid.    static char ID; -  HWAddressSanitizer() : FunctionPass(ID) {} +  HWAddressSanitizer(bool Recover = false) +      : FunctionPass(ID), Recover(Recover || ClRecover) {}    StringRef getPassName() const override { return "HWAddressSanitizer"; } @@ -109,6 +115,8 @@ private:    LLVMContext *C;    Type *IntptrTy; +  bool Recover; +    Function *HwasanCtorFunction;    Function *HwasanMemoryAccessCallback[2][kNumberOfAccessSizes]; @@ -126,8 +134,8 @@ INITIALIZE_PASS_END(      HWAddressSanitizer, "hwasan",      "HWAddressSanitizer: detect memory bugs using tagged addressing.", false, false) -FunctionPass *llvm::createHWAddressSanitizerPass() { -  return new HWAddressSanitizer(); +FunctionPass *llvm::createHWAddressSanitizerPass(bool Recover) { +  return new HWAddressSanitizer(Recover);  }  /// \brief Module-level initialization. @@ -156,10 +164,11 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) {    IRBuilder<> IRB(*C);    for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {      const std::string TypeStr = AccessIsWrite ? "store" : "load"; +    const std::string EndingStr = Recover ? 
"_noabort" : "";      HwasanMemoryAccessCallbackSized[AccessIsWrite] =          checkSanitizerInterfaceFunction(M.getOrInsertFunction( -            ClMemoryAccessCallbackPrefix + TypeStr, +            ClMemoryAccessCallbackPrefix + TypeStr + EndingStr,              FunctionType::get(IRB.getVoidTy(), {IntptrTy, IntptrTy}, false)));      for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; @@ -167,7 +176,7 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) {        HwasanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] =            checkSanitizerInterfaceFunction(M.getOrInsertFunction(                ClMemoryAccessCallbackPrefix + TypeStr + -                  itostr(1ULL << AccessSizeIndex), +                  itostr(1ULL << AccessSizeIndex) + EndingStr,                FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false)));      }    } @@ -246,14 +255,16 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *PtrLong, bool IsWrite,    Value *TagMismatch = IRB.CreateICmpNE(PtrTag, MemTag);    TerminatorInst *CheckTerm = -      SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, false, +      SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, !Recover,                                  MDBuilder(*C).createBranchWeights(1, 100000));    IRB.SetInsertPoint(CheckTerm);    // The signal handler will find the data address in x0.    InlineAsm *Asm = InlineAsm::get(        FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false), -      "hlt #" + itostr(0x100 + IsWrite * 0x10 + AccessSizeIndex), "{x0}", +      "hlt #" + +          itostr(0x100 + Recover * 0x20 + IsWrite * 0x10 + AccessSizeIndex), +      "{x0}",        /*hasSideEffects=*/true);    IRB.CreateCall(Asm, PtrLong);  } diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index d8c408035038..207243231aad 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -13,10 +13,11 @@  // threading, or IPA-CP based function cloning, etc.).  // As of now we support two cases :  // -// 1) If a call site is dominated by an OR condition and if any of its arguments -// are predicated on this OR condition, try to split the condition with more -// constrained arguments. For example, in the code below, we try to split the -// call site since we can predicate the argument(ptr) based on the OR condition. +// 1) Try to a split call-site with constrained arguments, if any constraints +// on any argument can be found by following the single predecessors of the +// all site's predecessors. Currently this pass only handles call-sites with 2 +// predecessors. For example, in the code below, we try to split the call-site +// since we can predicate the argument(ptr) based on the OR condition.  //  // Split from :  //   if (!ptr || c) @@ -200,16 +201,15 @@ static bool canSplitCallSite(CallSite CS) {  }  /// Return true if the CS is split into its new predecessors which are directly -/// hooked to each of its orignial predecessors pointed by PredBB1 and PredBB2. -/// In OR predicated case, PredBB1 will point the header, and PredBB2 will point -/// to the second compare block. CallInst1 and CallInst2 will be the new -/// call-sites placed in the new predecessors split for PredBB1 and PredBB2, -/// repectively. Therefore, CallInst1 will be the call-site placed -/// between Header and Tail, and CallInst2 will be the call-site between TBB and -/// Tail. 
For example, in the IR below with an OR condition, the call-site can -/// be split +/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2. +/// CallInst1 and CallInst2 will be the new call-sites placed in the new +/// predecessors split for PredBB1 and PredBB2, respectively. +/// For example, in the IR below with an OR condition, the call-site can +/// be split. Assuming PredBB1=Header and PredBB2=TBB, CallInst1 will be the +/// call-site placed between Header and Tail, and CallInst2 will be the +/// call-site between TBB and Tail.  /// -/// from : +/// From :  ///  ///   Header:  ///     %c = icmp eq i32* %a, null @@ -237,9 +237,9 @@ static bool canSplitCallSite(CallSite CS) {  ///   Tail:  ///    %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]  /// -/// Note that for an OR predicated case, CallInst1 and CallInst2 should be -/// created with more constrained arguments in -/// createCallSitesOnOrPredicatedArgument(). +/// Note that in case any arguments at the call-site are constrained by its +/// predecessors, new call-sites with more constrained arguments will be +/// created in createCallSitesOnPredicatedArgument().  static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,                            Instruction *CallInst1, Instruction *CallInst2) {    Instruction *Instr = CS.getInstruction(); @@ -332,18 +332,10 @@ static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) {    splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr);    return true;  } -// Check if one of the predecessors is a single predecessors of the other. -// This is a requirement for control flow modeling an OR. HeaderBB points to -// the single predecessor and OrBB points to other node. HeaderBB potentially -// contains the first compare of the OR and OrBB the second. 
-static bool isOrHeader(BasicBlock *HeaderBB, BasicBlock *OrBB) { -  return OrBB->getSinglePredecessor() == HeaderBB && -         HeaderBB->getTerminator()->getNumSuccessors() == 2; -} -static bool tryToSplitOnOrPredicatedArgument(CallSite CS) { +static bool tryToSplitOnPredicatedArgument(CallSite CS) {    auto Preds = getTwoPredecessors(CS.getInstruction()->getParent()); -  if (!isOrHeader(Preds[0], Preds[1]) && !isOrHeader(Preds[1], Preds[0])) +  if (Preds[0] == Preds[1])      return false;    SmallVector<std::pair<ICmpInst *, unsigned>, 2> C1, C2; @@ -362,7 +354,7 @@ static bool tryToSplitOnOrPredicatedArgument(CallSite CS) {  static bool tryToSplitCallSite(CallSite CS) {    if (!CS.arg_size() || !canSplitCallSite(CS))      return false; -  return tryToSplitOnOrPredicatedArgument(CS) || +  return tryToSplitOnPredicatedArgument(CS) ||           tryToSplitOnPHIPredicatedArgument(CS);  } diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 6b0377e0ecb3..1476f7850cf0 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -282,7 +282,7 @@ bool JumpThreading::runOnFunction(Function &F) {    auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();    std::unique_ptr<BlockFrequencyInfo> BFI;    std::unique_ptr<BranchProbabilityInfo> BPI; -  bool HasProfileData = F.getEntryCount().hasValue(); +  bool HasProfileData = F.hasProfileData();    if (HasProfileData) {      LoopInfo LI{DominatorTree(F)};      BPI.reset(new BranchProbabilityInfo(F, LI, TLI)); @@ -307,8 +307,7 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,    std::unique_ptr<BlockFrequencyInfo> BFI;    std::unique_ptr<BranchProbabilityInfo> BPI; -  bool HasProfileData = F.getEntryCount().hasValue(); -  if (HasProfileData) { +  if (F.hasProfileData()) {      LoopInfo LI{DominatorTree(F)};      BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));      BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); @@ -1333,6 +1332,20 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {    // code size.    BasicBlock *UnavailablePred = nullptr; +  // If the value is unavailable in one of predecessors, we will end up +  // inserting a new instruction into them. It is only valid if all the +  // instructions before LI are guaranteed to pass execution to its successor, +  // or if LI is safe to speculate. +  // TODO: If this logic becomes more complex, and we will perform PRE insertion +  // farther than to a predecessor, we need to reuse the code from GVN's PRE. +  // It requires domination tree analysis, so for this simple case it is an +  // overkill. +  if (PredsScanned.size() != AvailablePreds.size() && +      !isSafeToSpeculativelyExecute(LI)) +    for (auto I = LoadBB->begin(); &*I != LI; ++I) +      if (!isGuaranteedToTransferExecutionToSuccessor(&*I)) +        return false; +    // If there is exactly one predecessor where the value is unavailable, the    // already computed 'OneUnavailablePred' block is it.  If it ends in an    // unconditional branch, we know that it isn't a critical edge. diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp index c9d55b4594fe..430a7085d93f 100644 --- a/lib/Transforms/Scalar/LoopSink.cpp +++ b/lib/Transforms/Scalar/LoopSink.cpp @@ -247,7 +247,7 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,    // Enable LoopSink only when runtime profile is available.    // With static profile, the sinking decision may be sub-optimal. 
-  if (!Preheader->getParent()->getEntryCount()) +  if (!Preheader->getParent()->hasProfileData())      return false;    const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader); diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 7b1d6446a24a..15e7da5e1a7a 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -882,7 +882,7 @@ static bool computeUnrollCount(    }    // Check if the runtime trip count is too small when profile is available. -  if (L->getHeader()->getParent()->getEntryCount()) { +  if (L->getHeader()->getParent()->hasProfileData()) {      if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {        if (*ProfileTripCount < FlatLoopTripCountThreshold)          return false; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 9c870b42a747..6af3fef963dc 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -476,22 +476,33 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,        Alignment = DL.getABITypeAlignment(EltType);      } -    AMemSet = -      Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); +    // Remember the debug location. +    DebugLoc Loc; +    if (!Range.TheStores.empty()) +      Loc = Range.TheStores[0]->getDebugLoc();      DEBUG(dbgs() << "Replace stores:\n";            for (Instruction *SI : Range.TheStores) -            dbgs() << *SI << '\n'; -          dbgs() << "With: " << *AMemSet << '\n'); - -    if (!Range.TheStores.empty()) -      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); +            dbgs() << *SI << '\n');      // Zap all the stores.      for (Instruction *SI : Range.TheStores) {        MD->removeInstruction(SI);        SI->eraseFromParent();      } + +    // Create the memset after removing the stores, so that if there any cached +    // non-local dependencies on the removed instructions in +    // MemoryDependenceAnalysis, the cache entries are updated to "dirty" +    // entries pointing below the memset, so subsequent queries include the +    // memset. +    AMemSet = +      Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); +    if (!Range.TheStores.empty()) +      AMemSet->setDebugLoc(Loc); + +    DEBUG(dbgs() << "With: " << *AMemSet << '\n'); +      ++NumMemSetInfer;    } @@ -1031,9 +1042,22 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,    //    // NOTE: This is conservative, it will stop on any read from the source loc,    // not just the defining memcpy. 
-  MemDepResult SourceDep = -      MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, -                                   M->getIterator(), M->getParent()); +  MemoryLocation SourceLoc = MemoryLocation::getForSource(MDep); +  MemDepResult SourceDep = MD->getPointerDependencyFrom(SourceLoc, false, +                                                        M->getIterator(), M->getParent()); + +  if (SourceDep.isNonLocal()) { +    SmallVector<NonLocalDepResult, 2> NonLocalDepResults; +    MD->getNonLocalPointerDependencyFrom(M, SourceLoc, /*isLoad=*/false, +                                         NonLocalDepResults); +    if (NonLocalDepResults.size() == 1) { +      SourceDep = NonLocalDepResults[0].getResult(); +      assert((!SourceDep.getInst() || +              LookupDomTree().dominates(SourceDep.getInst(), M)) && +             "when memdep returns exactly one result, it should dominate"); +    } +  } +    if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)      return false; @@ -1235,6 +1259,18 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {    MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(        SrcLoc, true, M->getIterator(), M->getParent()); +  if (SrcDepInfo.isNonLocal()) { +    SmallVector<NonLocalDepResult, 2> NonLocalDepResults; +    MD->getNonLocalPointerDependencyFrom(M, SrcLoc, /*isLoad=*/true, +                                         NonLocalDepResults); +    if (NonLocalDepResults.size() == 1) { +      SrcDepInfo = NonLocalDepResults[0].getResult(); +      assert((!SrcDepInfo.getInst() || +              LookupDomTree().dominates(SrcDepInfo.getInst(), M)) && +             "when memdep returns exactly one result, it should dominate"); +    } +  } +    if (SrcDepInfo.isClobber()) {      if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))        return processMemCpyMemCpyDependence(M, MDep); diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index e5866b4718da..66608ec631f6 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -1929,9 +1929,32 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,          if (!I) continue;          bool Folded = ConstantFoldTerminator(I->getParent()); -        assert(Folded && -              "Expect TermInst on constantint or blockaddress to be folded"); -        (void) Folded; +        if (!Folded) { +          // The constant folder may not have been able to fold the terminator +          // if this is a branch or switch on undef.  Fold it manually as a +          // branch to the first successor. +#ifndef NDEBUG +          if (auto *BI = dyn_cast<BranchInst>(I)) { +            assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) && +                   "Branch should be foldable!"); +          } else if (auto *SI = dyn_cast<SwitchInst>(I)) { +            assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold"); +          } else { +            llvm_unreachable("Didn't fold away reference to block!"); +          } +#endif + +          // Make this an uncond branch to the first successor. +          TerminatorInst *TI = I->getParent()->getTerminator(); +          BranchInst::Create(TI->getSuccessor(0), TI); + +          // Remove entries in successor phi nodes to remove edges. +          for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i) +            TI->getSuccessor(i)->removePredecessor(TI->getParent()); + +          // Remove the old terminator. 
+          TI->eraseFromParent(); +        }        }        // Finally, delete the basic block. diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 209821ff21d7..8fa9ffb6d014 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -97,7 +97,7 @@  //    load %p2  //    ...  // -// We can not do CSE for to the common part related to index "i64 %i". Lowering +// We can not do CSE to the common part related to index "i64 %i". Lowering  // GEPs can achieve such goals.  // If the target does not use alias analysis in codegen, this pass will  // lower a GEP with multiple indices into arithmetic operations: diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp index eb3139ce4293..8825f77555e7 100644 --- a/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -23,10 +23,30 @@ using namespace llvm;  /// Fix-up phi nodes in an invoke instruction's normal destination.  ///  /// After versioning an invoke instruction, values coming from the original -/// block will now either be coming from the original block or the "else" block. +/// block will now be coming from the "merge" block. For example, in the code +/// below: +/// +///   then_bb: +///     %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +///   else_bb: +///     %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +///   merge_bb: +///     %t2 = phi i32 [ %t0, %then_bb ], [ %t1, %else_bb ] +///     br %normal_dst +/// +///   normal_dst: +///     %t3 = phi i32 [ %x, %orig_bb ], ... +/// +/// "orig_bb" is no longer a predecessor of "normal_dst", so the phi nodes in +/// "normal_dst" must be fixed to refer to "merge_bb": +/// +///    normal_dst: +///      %t3 = phi i32 [ %x, %merge_bb ], ... +///  static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock, -                                      BasicBlock *ElseBlock, -                                      Instruction *NewInst) { +                                      BasicBlock *MergeBlock) {    for (auto &I : *Invoke->getNormalDest()) {      auto *Phi = dyn_cast<PHINode>(&I);      if (!Phi) @@ -34,13 +54,7 @@ static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,      int Idx = Phi->getBasicBlockIndex(OrigBlock);      if (Idx == -1)        continue; -    Value *V = Phi->getIncomingValue(Idx); -    if (dyn_cast<Instruction>(V) == Invoke) { -      Phi->setIncomingBlock(Idx, ElseBlock); -      Phi->addIncoming(NewInst, OrigBlock); -      continue; -    } -    Phi->addIncoming(V, ElseBlock); +    Phi->setIncomingBlock(Idx, MergeBlock);    }  } @@ -48,6 +62,23 @@ static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,  ///  /// After versioning an invoke instruction, values coming from the original  /// block will now be coming from either the "then" block or the "else" block. +/// For example, in the code below: +/// +///   then_bb: +///     %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +///   else_bb: +///     %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +///   unwind_dst: +///     %t3 = phi i32 [ %x, %orig_bb ], ... 
+/// +/// "orig_bb" is no longer a predecessor of "unwind_dst", so the phi nodes in +/// "unwind_dst" must be fixed to refer to "then_bb" and "else_bb": +/// +///   unwind_dst: +///     %t3 = phi i32 [ %x, %then_bb ], [ %x, %else_bb ], ... +///  static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock,                                        BasicBlock *ThenBlock,                                        BasicBlock *ElseBlock) { @@ -64,44 +95,26 @@ static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock,    }  } -/// Get the phi node having the returned value of a call or invoke instruction -/// as it's operand. -static bool getRetPhiNode(Instruction *Inst, BasicBlock *Block) { -  BasicBlock *FromBlock = Inst->getParent(); -  for (auto &I : *Block) { -    PHINode *PHI = dyn_cast<PHINode>(&I); -    if (!PHI) -      break; -    int Idx = PHI->getBasicBlockIndex(FromBlock); -    if (Idx == -1) -      continue; -    auto *V = PHI->getIncomingValue(Idx); -    if (V == Inst) -      return true; -  } -  return false; -} -  /// Create a phi node for the returned value of a call or invoke instruction.  ///  /// After versioning a call or invoke instruction that returns a value, we have  /// to merge the value of the original and new instructions. We do this by  /// creating a phi node and replacing uses of the original instruction with this  /// phi node. -static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst) { +/// +/// For example, if \p OrigInst is defined in "else_bb" and \p NewInst is +/// defined in "then_bb", we create the following phi node: +/// +///   ; Uses of the original instruction are replaced by uses of the phi node. +///   %t0 = phi i32 [ %orig_inst, %else_bb ], [ %new_inst, %then_bb ], +/// +static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst, +                             BasicBlock *MergeBlock, IRBuilder<> &Builder) {    if (OrigInst->getType()->isVoidTy() || OrigInst->use_empty())      return; -  BasicBlock *RetValBB = NewInst->getParent(); -  if (auto *Invoke = dyn_cast<InvokeInst>(NewInst)) -    RetValBB = Invoke->getNormalDest(); -  BasicBlock *PhiBB = RetValBB->getSingleSuccessor(); - -  if (getRetPhiNode(OrigInst, PhiBB)) -    return; - -  IRBuilder<> Builder(&PhiBB->front()); +  Builder.SetInsertPoint(&MergeBlock->front());    PHINode *Phi = Builder.CreatePHI(OrigInst->getType(), 0);    SmallVector<User *, 16> UsersToUpdate;    for (User *U : OrigInst->users()) @@ -109,7 +122,7 @@ static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst) {    for (User *U : UsersToUpdate)      U->replaceUsesOfWith(OrigInst, Phi);    Phi->addIncoming(OrigInst, OrigInst->getParent()); -  Phi->addIncoming(NewInst, RetValBB); +  Phi->addIncoming(NewInst, NewInst->getParent());  }  /// Cast a call or invoke instruction to the given type. @@ -118,7 +131,41 @@ static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst) {  /// that of the callee. If this is the case, we have to cast the returned value  /// to the correct type. The location of the cast depends on if we have a call  /// or invoke instruction. -Instruction *createRetBitCast(CallSite CS, Type *RetTy) { +/// +/// For example, if the call instruction below requires a bitcast after +/// promotion: +/// +///   orig_bb: +///     %t0 = call i32 @func() +///     ... +/// +/// The bitcast is placed after the call instruction: +/// +///   orig_bb: +///     ; Uses of the original return value are replaced by uses of the bitcast. 
+///     %t0 = call i32 @func() +///     %t1 = bitcast i32 %t0 to ... +///     ... +/// +/// A similar transformation is performed for invoke instructions. However, +/// since invokes are terminating, a new block is created for the bitcast. For +/// example, if the invoke instruction below requires a bitcast after promotion: +/// +///   orig_bb: +///     %t0 = invoke i32 @func() to label %normal_dst unwind label %unwind_dst +/// +/// The edge between the original block and the invoke's normal destination is +/// split, and the bitcast is placed there: +/// +///   orig_bb: +///     %t0 = invoke i32 @func() to label %split_bb unwind label %unwind_dst +/// +///   split_bb: +///     ; Uses of the original return value are replaced by uses of the bitcast. +///     %t1 = bitcast i32 %t0 to ... +///     br label %normal_dst +/// +static void createRetBitCast(CallSite CS, Type *RetTy, CastInst **RetBitCast) {    // Save the users of the calling instruction. These uses will be changed to    // use the bitcast after we create it. @@ -130,19 +177,20 @@ Instruction *createRetBitCast(CallSite CS, Type *RetTy) {    // value. The location depends on if we have a call or invoke instruction.    Instruction *InsertBefore = nullptr;    if (auto *Invoke = dyn_cast<InvokeInst>(CS.getInstruction())) -    InsertBefore = &*Invoke->getNormalDest()->getFirstInsertionPt(); +    InsertBefore = +        &SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->front();    else      InsertBefore = &*std::next(CS.getInstruction()->getIterator());    // Bitcast the return value to the correct type.    auto *Cast = CastInst::Create(Instruction::BitCast, CS.getInstruction(),                                  RetTy, "", InsertBefore); +  if (RetBitCast) +    *RetBitCast = Cast;    // Replace all the original uses of the calling instruction with the bitcast.    for (User *U : UsersToUpdate)      U->replaceUsesOfWith(CS.getInstruction(), Cast); - -  return Cast;  }  /// Predicate and clone the given call site. @@ -152,21 +200,78 @@ Instruction *createRetBitCast(CallSite CS, Type *RetTy) {  /// callee. The original call site is moved into the "else" block, and a clone  /// of the call site is placed in the "then" block. The cloned instruction is  /// returned. +/// +/// For example, the call instruction below: +/// +///   orig_bb: +///     %t0 = call i32 %ptr() +///     ... +/// +/// Is replace by the following: +/// +///   orig_bb: +///     %cond = icmp eq i32 ()* %ptr, @func +///     br i1 %cond, %then_bb, %else_bb +/// +///   then_bb: +///     ; The clone of the original call instruction is placed in the "then" +///     ; block. It is not yet promoted. +///     %t1 = call i32 %ptr() +///     br merge_bb +/// +///   else_bb: +///     ; The original call instruction is moved to the "else" block. +///     %t0 = call i32 %ptr() +///     br merge_bb +/// +///   merge_bb: +///     ; Uses of the original call instruction are replaced by uses of the phi +///     ; node. +///     %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ] +///     ... +/// +/// A similar transformation is performed for invoke instructions. However, +/// since invokes are terminating, more work is required. 
For example, the +/// invoke instruction below: +/// +///   orig_bb: +///     %t0 = invoke %ptr() to label %normal_dst unwind label %unwind_dst +/// +/// Is replace by the following: +/// +///   orig_bb: +///     %cond = icmp eq i32 ()* %ptr, @func +///     br i1 %cond, %then_bb, %else_bb +/// +///   then_bb: +///     ; The clone of the original invoke instruction is placed in the "then" +///     ; block, and its normal destination is set to the "merge" block. It is +///     ; not yet promoted. +///     %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +///   else_bb: +///     ; The original invoke instruction is moved into the "else" block, and +///     ; its normal destination is set to the "merge" block. +///     %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +///   merge_bb: +///     ; Uses of the original invoke instruction are replaced by uses of the +///     ; phi node, and the merge block branches to the normal destination. +///     %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ] +///     br %normal_dst +///  static Instruction *versionCallSite(CallSite CS, Value *Callee, -                                    MDNode *BranchWeights, -                                    BasicBlock *&ThenBlock, -                                    BasicBlock *&ElseBlock, -                                    BasicBlock *&MergeBlock) { +                                    MDNode *BranchWeights) {    IRBuilder<> Builder(CS.getInstruction());    Instruction *OrigInst = CS.getInstruction(); +  BasicBlock *OrigBlock = OrigInst->getParent();    // Create the compare. The called value and callee must have the same type to    // be compared. -  auto *LHS = -      Builder.CreateBitCast(CS.getCalledValue(), Builder.getInt8PtrTy()); -  auto *RHS = Builder.CreateBitCast(Callee, Builder.getInt8PtrTy()); -  auto *Cond = Builder.CreateICmpEQ(LHS, RHS); +  if (CS.getCalledValue()->getType() != Callee->getType()) +    Callee = Builder.CreateBitCast(Callee, CS.getCalledValue()->getType()); +  auto *Cond = Builder.CreateICmpEQ(CS.getCalledValue(), Callee);    // Create an if-then-else structure. The original instruction is moved into    // the "else" block, and a clone of the original instruction is placed in the @@ -175,9 +280,9 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,    TerminatorInst *ElseTerm = nullptr;    SplitBlockAndInsertIfThenElse(Cond, CS.getInstruction(), &ThenTerm, &ElseTerm,                                  BranchWeights); -  ThenBlock = ThenTerm->getParent(); -  ElseBlock = ElseTerm->getParent(); -  MergeBlock = OrigInst->getParent(); +  BasicBlock *ThenBlock = ThenTerm->getParent(); +  BasicBlock *ElseBlock = ElseTerm->getParent(); +  BasicBlock *MergeBlock = OrigInst->getParent();    ThenBlock->setName("if.true.direct_targ");    ElseBlock->setName("if.false.orig_indirect"); @@ -188,7 +293,8 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,    NewInst->insertBefore(ThenTerm);    // If the original call site is an invoke instruction, we have extra work to -  // do since invoke instructions are terminating. +  // do since invoke instructions are terminating. We have to fix-up phi nodes +  // in the invoke's normal and unwind destinations.    
if (auto *OrigInvoke = dyn_cast<InvokeInst>(OrigInst)) {      auto *NewInvoke = cast<InvokeInst>(NewInst); @@ -201,11 +307,19 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,      Builder.SetInsertPoint(MergeBlock);      Builder.CreateBr(OrigInvoke->getNormalDest()); -    // Now set the normal destination of new the invoke instruction to be the +    // Fix-up phi nodes in the original invoke's normal and unwind destinations. +    fixupPHINodeForNormalDest(OrigInvoke, OrigBlock, MergeBlock); +    fixupPHINodeForUnwindDest(OrigInvoke, MergeBlock, ThenBlock, ElseBlock); + +    // Now set the normal destinations of the invoke instructions to be the      // "merge" block. +    OrigInvoke->setNormalDest(MergeBlock);      NewInvoke->setNormalDest(MergeBlock);    } +  // Create a phi node for the returned value of the call site. +  createRetPHINode(OrigInst, NewInst, MergeBlock, Builder); +    return NewInst;  } @@ -253,7 +367,8 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee,    return true;  } -static void promoteCall(CallSite CS, Function *Callee, Instruction *&Cast) { +Instruction *llvm::promoteCall(CallSite CS, Function *Callee, +                               CastInst **RetBitCast) {    assert(!CS.getCalledFunction() && "Only indirect call sites can be promoted");    // Set the called function of the call site to be the given callee. @@ -268,7 +383,7 @@ static void promoteCall(CallSite CS, Function *Callee, Instruction *&Cast) {    // If the function type of the call site matches that of the callee, no    // additional work is required.    if (CS.getFunctionType() == Callee->getFunctionType()) -    return; +    return CS.getInstruction();    // Save the return types of the call site and callee.    Type *CallSiteRetTy = CS.getInstruction()->getType(); @@ -294,7 +409,9 @@ static void promoteCall(CallSite CS, Function *Callee, Instruction *&Cast) {    // If the return type of the call site doesn't match that of the callee, cast    // the returned value to the appropriate type.    if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) -    Cast = createRetBitCast(CS, CallSiteRetTy); +    createRetBitCast(CS, CallSiteRetTy, RetBitCast); + +  return CS.getInstruction();  }  Instruction *llvm::promoteCallWithIfThenElse(CallSite CS, Function *Callee, @@ -303,26 +420,10 @@ Instruction *llvm::promoteCallWithIfThenElse(CallSite CS, Function *Callee,    // Version the indirect call site. If the called value is equal to the given    // callee, 'NewInst' will be executed, otherwise the original call site will    // be executed. -  BasicBlock *ThenBlock, *ElseBlock, *MergeBlock; -  Instruction *NewInst = versionCallSite(CS, Callee, BranchWeights, ThenBlock, -                                         ElseBlock, MergeBlock); +  Instruction *NewInst = versionCallSite(CS, Callee, BranchWeights);    // Promote 'NewInst' so that it directly calls the desired function. -  Instruction *Cast = NewInst; -  promoteCall(CallSite(NewInst), Callee, Cast); - -  // If the original call site is an invoke instruction, we have to fix-up phi -  // nodes in the invoke's normal and unwind destinations. -  if (auto *OrigInvoke = dyn_cast<InvokeInst>(CS.getInstruction())) { -    fixupPHINodeForNormalDest(OrigInvoke, MergeBlock, ElseBlock, Cast); -    fixupPHINodeForUnwindDest(OrigInvoke, MergeBlock, ThenBlock, ElseBlock); -  } - -  // Create a phi node for the returned value of the call site. -  createRetPHINode(CS.getInstruction(), Cast ? Cast : NewInst); - -  // Return the new direct call. 
-  return NewInst; +  return promoteCall(CallSite(NewInst), Callee);  }  #undef DEBUG_TYPE diff --git a/lib/Transforms/Utils/LoopUnrollPeel.cpp b/lib/Transforms/Utils/LoopUnrollPeel.cpp index 4273ce0b6200..c84ae7d693d7 100644 --- a/lib/Transforms/Utils/LoopUnrollPeel.cpp +++ b/lib/Transforms/Utils/LoopUnrollPeel.cpp @@ -203,7 +203,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,    // hit the peeled section.    // We only do this in the presence of profile information, since otherwise    // our estimates of the trip count are not reliable enough. -  if (UP.AllowPeeling && L->getHeader()->getParent()->getEntryCount()) { +  if (UP.AllowPeeling && L->getHeader()->getParent()->hasProfileData()) {      Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L);      if (!PeelCount)        return; diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index f02f80cc1b78..b3c80424c8b9 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -127,6 +127,16 @@ static cl::opt<unsigned> MaxSpeculationDepth(      cl::desc("Limit maximum recursion depth when calculating costs of "               "speculatively executed instructions")); +static cl::opt<unsigned> DependenceChainLatency( +    "dependence-chain-latency", cl::Hidden, cl::init(8), +    cl::desc("Limit the maximum latency of dependence chain containing cmp " +             "for if conversion")); + +static cl::opt<unsigned> SmallBBSize( +    "small-bb-size", cl::Hidden, cl::init(40), +    cl::desc("Check dependence chain latency only in basic block smaller than " +             "this number")); +  STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");  STATISTIC(NumLinearMaps,            "Number of switch instructions turned into linear mapping"); @@ -395,6 +405,166 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,    return true;  } +/// Estimate the code size of the specified BB. +static unsigned CountBBCodeSize(BasicBlock *BB, +                                const TargetTransformInfo &TTI) { +  unsigned Size = 0; +  for (auto II = BB->begin(); !isa<TerminatorInst>(II); ++II) +    Size += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_CodeSize); +  return Size; +} + +/// Find out the latency of the longest dependence chain in the BB if +/// LongestChain is true, or the dependence chain containing the compare +/// instruction feeding the block's conditional branch. +static unsigned FindDependenceChainLatency(BasicBlock *BB, +                            DenseMap<Instruction *, unsigned> &Instructions, +                            const TargetTransformInfo &TTI, +                            bool LongestChain) { +  unsigned MaxLatency = 0; + +  BasicBlock::iterator II; +  for (II = BB->begin(); !isa<TerminatorInst>(II); ++II) { +    unsigned Latency = 0; +    for (unsigned O = 0, E = II->getNumOperands(); O != E; ++O) { +      Instruction *Op = dyn_cast<Instruction>(II->getOperand(O)); +      if (Op && Instructions.count(Op)) { +        auto OpLatency = Instructions[Op]; +        if (OpLatency > Latency) +          Latency = OpLatency; +      } +    } +    Latency += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_Latency); +    Instructions[&(*II)] = Latency; + +    if (Latency > MaxLatency) +      MaxLatency = Latency; +  } + +  if (LongestChain) +    return MaxLatency; + +  // The length of the dependence chain containing the compare instruction is +  // wanted, so the terminator must be a BranchInst. 
+  assert(isa<BranchInst>(II)); +  BranchInst* Br = cast<BranchInst>(II); +  Instruction *Cmp = dyn_cast<Instruction>(Br->getCondition()); +  if (Cmp && Instructions.count(Cmp)) +    return Instructions[Cmp]; +  else +    return 0; +} + +/// Instructions in BB2 may depend on instructions in BB1, and instructions +/// in BB1 may have users in BB2. If the last (in terms of latency) such kind +/// of instruction in BB1 is I, then the instructions after I can be executed +/// in parallel with instructions in BB2. +/// This function returns the latency of I. +static unsigned LatencyAdjustment(BasicBlock *BB1, BasicBlock *BB2, +                        BasicBlock *IfBlock1, BasicBlock *IfBlock2, +                        DenseMap<Instruction *, unsigned> &BB1Instructions) { +  unsigned LastLatency = 0; +  SmallVector<Instruction *, 16> Worklist; +  BasicBlock::iterator II; +  for (II = BB2->begin(); !isa<TerminatorInst>(II); ++II) { +    if (PHINode *PN = dyn_cast<PHINode>(II)) { +      // Look for users in BB2. +      bool InBBUser = false; +      for (User *U : PN->users()) { +        if (cast<Instruction>(U)->getParent() == BB2) { +          InBBUser = true; +          break; +        } +      } +      // No such user, we don't care about this instruction and its operands. +      if (!InBBUser) +        break; +    } +    Worklist.push_back(&(*II)); +  } + +  while (!Worklist.empty()) { +    Instruction *I = Worklist.pop_back_val(); +    for (unsigned O = 0, E = I->getNumOperands(); O != E; ++O) { +      if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(O))) { +        if (Op->getParent() == IfBlock1 || Op->getParent() == IfBlock2) +          Worklist.push_back(Op); +        else if (Op->getParent() == BB1 && BB1Instructions.count(Op)) { +          if (BB1Instructions[Op] > LastLatency) +            LastLatency = BB1Instructions[Op]; +        } +      } +    } +  } + +  return LastLatency; +} + +/// If after if conversion, most of the instructions in this new BB construct a +/// long and slow dependence chain, it may be slower than cmp/branch, even +/// if the branch has a high miss rate, because the control dependence is +/// transformed into data dependence, and control dependence can be speculated, +/// and thus, the second part can execute in parallel with the first part on +/// modern OOO processor. +/// +/// To check this condition, this function finds the length of the dependence +/// chain in BB1 (only the part that can be executed in parallel with code after +/// branch in BB2) containing cmp, and if the length is longer than a threshold, +/// don't perform if conversion. +/// +/// BB1, BB2, IfBlock1 and IfBlock2 are candidate BBs for if conversion. +/// SpeculationSize contains the code size of IfBlock1 and IfBlock2. +static bool FindLongDependenceChain(BasicBlock *BB1, BasicBlock *BB2, +                             BasicBlock *IfBlock1, BasicBlock *IfBlock2, +                             unsigned SpeculationSize, +                             const TargetTransformInfo &TTI) { +  // Accumulated latency of each instruction in their BBs. +  DenseMap<Instruction *, unsigned> BB1Instructions; +  DenseMap<Instruction *, unsigned> BB2Instructions; + +  if (!TTI.isOutOfOrder()) +    return false; + +  unsigned NewBBSize = CountBBCodeSize(BB1, TTI) + CountBBCodeSize(BB2, TTI) +                         + SpeculationSize; + +  // We check small BB only since it is more difficult to find unrelated +  // instructions to fill functional units in a small BB. 
+  if (NewBBSize > SmallBBSize) +    return false; + +  auto BB1Chain = +         FindDependenceChainLatency(BB1, BB1Instructions, TTI, false); +  auto BB2Chain = +         FindDependenceChainLatency(BB2, BB2Instructions, TTI, true); + +  // If there are many unrelated instructions in the new BB, there will be +  // other instructions for the processor to issue regardless of the length +  // of this new dependence chain. +  // Modern processors can issue 3 or more instructions in each cycle. But in +  // real world applications, an IPC of 2 is already very good for non-loop +  // code with small basic blocks. Higher IPC is usually found in programs with +  // small kernel. So IPC of 2 is more reasonable for most applications. +  if ((BB1Chain + BB2Chain) * 2 <= NewBBSize) +    return false; + +  // We only care about part of the dependence chain in BB1 that can be +  // executed in parallel with BB2, so adjust the latency. +  BB1Chain -= +      LatencyAdjustment(BB1, BB2, IfBlock1, IfBlock2, BB1Instructions); + +  // Correctly predicted branch instruction can skip the dependence chain in +  // BB1, but misprediction has a penalty, so only when the dependence chain is +  // longer than DependenceChainLatency, then branch is better than select. +  // Besides misprediction penalty, the threshold value DependenceChainLatency +  // also depends on branch misprediction rate, taken branch latency and cmov +  // latency. +  if (BB1Chain >= DependenceChainLatency) +    return true; + +  return false; +} +  /// Extract ConstantInt from value, looking through IntToPtr  /// and PointerNullValue. Return NULL if value is not a constant int.  static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) { @@ -1654,14 +1824,11 @@ namespace {  } // end anonymous namespace -/// Given an unconditional branch that goes to BBEnd, -/// check whether BBEnd has only two predecessors and the other predecessor -/// ends with an unconditional branch. If it is true, sink any common code -/// in the two predecessors to BBEnd. -static bool SinkThenElseCodeToEnd(BranchInst *BI1) { -  assert(BI1->isUnconditional()); -  BasicBlock *BBEnd = BI1->getSuccessor(0); - +/// Check whether BB's predecessors end with unconditional branches. If it is +/// true, sink any common code from the predecessors to BB. +/// We also allow one predecessor to end with conditional branch (but no more +/// than one). +static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {    // We support two situations:    //   (1) all incoming arcs are unconditional    //   (2) one incoming arc is conditional @@ -1705,7 +1872,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {    //    SmallVector<BasicBlock*,4> UnconditionalPreds;    Instruction *Cond = nullptr; -  for (auto *B : predecessors(BBEnd)) { +  for (auto *B : predecessors(BB)) {      auto *T = B->getTerminator();      if (isa<BranchInst>(T) && cast<BranchInst>(T)->isUnconditional())        UnconditionalPreds.push_back(B); @@ -1773,8 +1940,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {      DEBUG(dbgs() << "SINK: Splitting edge\n");      // We have a conditional edge and we're going to sink some instructions.      // Insert a new block postdominating all blocks we're going to sink from. -    if (!SplitBlockPredecessors(BI1->getSuccessor(0), UnconditionalPreds, -                                ".sink.split")) +    if (!SplitBlockPredecessors(BB, UnconditionalPreds, ".sink.split"))        // Edges couldn't be split.        
return false;      Changed = true; @@ -2048,6 +2214,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,    if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue))      return false; +  // Don't do if conversion for long dependence chain. +  if (FindLongDependenceChain(BB, EndBB, ThenBB, nullptr, +                              CountBBCodeSize(ThenBB, TTI), TTI)) +    return false; +    // If we get here, we can hoist the instruction and if-convert.    DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";); @@ -2355,6 +2526,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,        }    } +  if (FindLongDependenceChain(DomBlock, BB, IfBlock1, IfBlock2, +                              AggressiveInsts.size(), TTI)) +    return false; +    DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond << "  T: "                 << IfTrue->getName() << "  F: " << IfFalse->getName() << "\n"); @@ -5728,9 +5903,6 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI,    BasicBlock *BB = BI->getParent();    BasicBlock *Succ = BI->getSuccessor(0); -  if (SinkCommon && Options.SinkCommonInsts && SinkThenElseCodeToEnd(BI)) -    return true; -    // If the Terminator is the only non-phi instruction, simplify the block.    // If LoopHeader is provided, check if the block or its successor is a loop    // header. (This is for early invocations before loop simplify and @@ -6008,6 +6180,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {    if (MergeBlockIntoPredecessor(BB))      return true; +  if (SinkCommon && Options.SinkCommonInsts) +    Changed |= SinkCommonCodeFromPredecessors(BB); +    IRBuilder<> Builder(BB);    // If there is a trivial two-entry PHI node in this basic block, and we can diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index fbcdc0df0f1c..52f32cda2609 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5049,13 +5049,13 @@ bool LoopVectorizationLegality::canVectorize() {    bool Result = true;    bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); -  if (DoExtraAnalysis)    // We must have a loop in canonical form. Loops with indirectbr in them cannot    // be canonicalized.    if (!TheLoop->getLoopPreheader()) { +    DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");      ORE->emit(createMissedAnalysis("CFGNotUnderstood")                << "loop control flow is not understood by vectorizer"); -  if (DoExtraAnalysis) +    if (DoExtraAnalysis)        Result = false;      else        return false; diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 76ba62f5d596..a7ccd3faec44 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -646,23 +646,17 @@ private:    int getEntryCost(TreeEntry *E);    /// This is the recursive part of buildTree. -  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int UserIndx = -1, -                     int OpdNum = 0); +  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);    /// \returns True if the ExtractElement/ExtractValue instructions in VL can    /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).    
   bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
 
-  /// Vectorize a single entry in the tree.\p OpdNum indicate the ordinality of
-  /// operand corrsponding to this tree entry \p E for the user tree entry
-  /// indicated by \p UserIndx.
-  //  In other words, "E == TreeEntry[UserIndx].getOperand(OpdNum)".
-  Value *vectorizeTree(TreeEntry *E, int OpdNum = 0, int UserIndx = -1);
+  /// Vectorize a single entry in the tree.
+  Value *vectorizeTree(TreeEntry *E);
 
-  /// Vectorize a single entry in the tree, starting in \p VL.\p OpdNum indicate
-  /// the ordinality of operand corrsponding to the \p VL of scalar values for the
-  /// user indicated by \p UserIndx this \p VL feeds into.
-  Value *vectorizeTree(ArrayRef<Value *> VL, int OpdNum = 0, int UserIndx = -1);
+  /// Vectorize a single entry in the tree, starting in \p VL.
+  Value *vectorizeTree(ArrayRef<Value *> VL);
 
   /// \returns the pointer to the vectorized value if \p VL is already
   /// vectorized, or NULL. They may happen in cycles.
@@ -708,16 +702,6 @@ private:
       return std::equal(VL.begin(), VL.end(), Scalars.begin());
     }
 
-    /// \returns true if the scalars in VL are found in this tree entry.
-    bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL,
-        ScalarEvolution &SE) const {
-      assert(VL.size() == Scalars.size() && "Invalid size");
-      SmallVector<Value *, 8> List;
-      if (!sortLoadAccesses(VL, DL, SE, List))
-        return false;
-      return std::equal(List.begin(), List.end(), Scalars.begin());
-    }
-
     /// A vector of scalars.
     ValueList Scalars;
 
@@ -727,14 +711,6 @@ private:
     /// Do we need to gather this sequence ?
     bool NeedToGather = false;
 
-    /// Records optional shuffle mask for the uses of jumbled memory accesses.
-    /// For example, a non-empty ShuffleMask[1] represents the permutation of
-    /// lanes that operand #1 of this vectorized instruction should undergo
-    /// before feeding this vectorized instruction, whereas an empty
-    /// ShuffleMask[0] indicates that the lanes of operand #0 of this vectorized
-    /// instruction need not be permuted at all.
-    SmallVector<SmallVector<unsigned, 4>, 2> ShuffleMask;
-
    /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -750,31 +726,12 @@ private:
   /// Create a new VectorizableTree entry.
   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
-                          int &UserTreeIdx, const InstructionsState &S,
-                          ArrayRef<unsigned> ShuffleMask = None,
-                          int OpdNum = 0) {
-    assert((!Vectorized || S.Opcode != 0) &&
-           "Vectorized TreeEntry without opcode");
+                          int &UserTreeIdx) {
     VectorizableTree.emplace_back(VectorizableTree);
-
     int idx = VectorizableTree.size() - 1;
     TreeEntry *Last = &VectorizableTree[idx];
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->NeedToGather = !Vectorized;
-
-    TreeEntry *UserTreeEntry = nullptr;
-    if (UserTreeIdx != -1)
-      UserTreeEntry = &VectorizableTree[UserTreeIdx];
-
-    if (UserTreeEntry && !ShuffleMask.empty()) {
-      if ((unsigned)OpdNum >= UserTreeEntry->ShuffleMask.size())
-        UserTreeEntry->ShuffleMask.resize(OpdNum + 1);
-      assert(UserTreeEntry->ShuffleMask[OpdNum].empty() &&
-             "Mask already present");
-      using mask = SmallVector<unsigned, 4>;
-      mask tempMask(ShuffleMask.begin(), ShuffleMask.end());
-      UserTreeEntry->ShuffleMask[OpdNum] = tempMask;
-    }
     if (Vectorized) {
       for (int i = 0, e = VL.size(); i != e; ++i) {
         assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -1427,34 +1384,34 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
 }
 
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
-                            int UserTreeIdx, int OpdNum) {
+                            int UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
 
   InstructionsState S = getSameOpcode(VL);
   if (Depth == RecursionMaxDepth) {
     DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
-    newTreeEntry(VL, false, UserTreeIdx, S);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
 
   // Don't handle vectors.
   if (S.OpValue->getType()->isVectorTy()) {
     DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
-    newTreeEntry(VL, false, UserTreeIdx, S);
+    newTreeEntry(VL, false, UserTreeIdx);
    return;
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
     if (SI->getValueOperand()->getType()->isVectorTy()) {
       DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
-      newTreeEntry(VL, false, UserTreeIdx, S);
+      newTreeEntry(VL, false, UserTreeIdx);
       return;
     }
 
   // If all of the operands are identical or constant we have a simple solution.
   if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
-    newTreeEntry(VL, false, UserTreeIdx, S);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
 
@@ -1466,7 +1423,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     if (EphValues.count(VL[i])) {
       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
             ") is ephemeral.\n");
-      newTreeEntry(VL, false, UserTreeIdx, S);
+      newTreeEntry(VL, false, UserTreeIdx);
       return;
     }
   }
@@ -1477,7 +1434,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
       if (E->Scalars[i] != VL[i]) {
         DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
-        newTreeEntry(VL, false, UserTreeIdx, S);
+        newTreeEntry(VL, false, UserTreeIdx);
        return;
       }
     }
@@ -1496,7 +1453,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     if (getTreeEntry(I)) {
       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
             ") is already in tree.\n");
-      newTreeEntry(VL, false, UserTreeIdx, S);
+      newTreeEntry(VL, false, UserTreeIdx);
       return;
     }
   }
@@ -1506,7 +1463,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
     if (MustGather.count(VL[i])) {
       DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
-      newTreeEntry(VL, false, UserTreeIdx, S);
+      newTreeEntry(VL, false, UserTreeIdx);
       return;
     }
   }
@@ -1520,7 +1477,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     // Don't go into unreachable blocks. They may contain instructions with
     // dependency cycles which confuse the final scheduling.
     DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
-    newTreeEntry(VL, false, UserTreeIdx, S);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
 
@@ -1529,7 +1486,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     for (unsigned j = i + 1; j < e; ++j)
       if (VL[i] == VL[j]) {
        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-        newTreeEntry(VL, false, UserTreeIdx, S);
+        newTreeEntry(VL, false, UserTreeIdx);
         return;
       }
 
@@ -1544,7 +1501,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     assert((!BS.getScheduleData(VL0) ||
             !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
-    newTreeEntry(VL, false, UserTreeIdx, S);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -1563,12 +1520,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           if (Term) {
             DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
             BS.cancelScheduling(VL, VL0);
-            newTreeEntry(VL, false, UserTreeIdx, S);
+            newTreeEntry(VL, false, UserTreeIdx);
             return;
           }
         }
 
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
 
       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1578,7 +1535,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
               PH->getIncomingBlock(i)));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1590,7 +1547,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       } else {
         BS.cancelScheduling(VL, VL0);
       }
-      newTreeEntry(VL, Reuse, UserTreeIdx, S);
+      newTreeEntry(VL, Reuse, UserTreeIdx);
       return;
     }
     case Instruction::Load: {
@@ -1605,7 +1562,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy)) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx, S);
+        newTreeEntry(VL, false, UserTreeIdx);
         DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
         return;
       }
@@ -1616,13 +1573,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         LoadInst *L = cast<LoadInst>(VL[i]);
         if (!L->isSimple()) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
           return;
         }
       }
 
       // Check if the loads are consecutive, reversed, or neither.
+      // TODO: What we really want is to sort the loads, but for now, check
+      // the two likely directions.
       bool Consecutive = true;
       bool ReverseConsecutive = true;
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
@@ -1636,7 +1595,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
       if (Consecutive) {
         ++NumLoadsWantToKeepOrder;
-        newTreeEntry(VL, true, UserTreeIdx, S);
+        newTreeEntry(VL, true, UserTreeIdx);
        DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         return;
       }
@@ -1650,41 +1609,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             break;
           }
 
+      BS.cancelScheduling(VL, VL0);
+      newTreeEntry(VL, false, UserTreeIdx);
+
       if (ReverseConsecutive) {
-        DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
         ++NumLoadsWantToChangeOrder;
-        BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx, S);
-        return;
-      }
-
-      if (VL.size() > 2) {
-        bool ShuffledLoads = true;
-        SmallVector<Value *, 8> Sorted;
-        SmallVector<unsigned, 4> Mask;
-        if (sortLoadAccesses(VL, *DL, *SE, Sorted, &Mask)) {
-          auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end());
-          for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) {
-            if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) {
-              ShuffledLoads = false;
-              break;
-            }
-          }
-          // TODO: Tracking how many load wants to have arbitrary shuffled order
-          // would be usefull.
-          if (ShuffledLoads) {
-            DEBUG(dbgs() << "SLP: added a vector of loads which needs "
-                            "permutation of loaded lanes.\n");
-            newTreeEntry(NewVL, true, UserTreeIdx, S,
-                         makeArrayRef(Mask.begin(), Mask.end()), OpdNum);
-            return;
-          }
-        }
+        DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
+      } else {
+        DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
       }
-
-      DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
-      BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, false, UserTreeIdx, S);
       return;
     }
     case Instruction::ZExt:
@@ -1704,12 +1637,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
         if (Ty != SrcTy || !isValidElementType(Ty)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
           return;
         }
       }
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1718,7 +1651,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1732,13 +1665,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         if (Cmp->getPredicate() != P0 ||
             Cmp->getOperand(0)->getType() != ComparedTy) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
           return;
         }
       }
 
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1747,7 +1680,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1770,7 +1703,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     case Instruction::And:
     case Instruction::Or:
    case Instruction::Xor:
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
 
       // Sort operands of the instructions so that each side is more likely to
@@ -1779,7 +1712,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         ValueList Left, Right;
         reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
         buildTree_rec(Left, Depth + 1, UserTreeIdx);
-        buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);
+        buildTree_rec(Right, Depth + 1, UserTreeIdx);
         return;
       }
 
@@ -1789,7 +1722,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
        for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
 
@@ -1799,7 +1732,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
           DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           return;
         }
       }
@@ -1812,7 +1745,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         if (Ty0 != CurTy) {
           DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           return;
         }
       }
@@ -1824,12 +1757,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           DEBUG(
               dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           return;
         }
       }
 
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
       for (unsigned i = 0, e = 2; i < e; ++i) {
         ValueList Operands;
@@ -1837,7 +1770,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1846,12 +1779,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
         if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
           return;
         }
 
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of stores.\n");
 
       ValueList Operands;
@@ -1869,7 +1802,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       if (!isTriviallyVectorizable(ID)) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx, S);
+        newTreeEntry(VL, false, UserTreeIdx);
         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
         return;
       }
@@ -1883,7 +1816,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
            getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
            !CI->hasIdenticalOperandBundleSchema(*CI2)) {
          BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
          DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                       << "\n");
          return;
@@ -1894,7 +1827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
          Value *A1J = CI2->getArgOperand(1);
          if (A1I != A1J) {
            BS.cancelScheduling(VL, VL0);
-            newTreeEntry(VL, false, UserTreeIdx, S);
+            newTreeEntry(VL, false, UserTreeIdx);
            DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                         << " argument "<< A1I<<"!=" << A1J
                         << "\n");
@@ -1907,14 +1840,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                        CI->op_begin() + CI->getBundleOperandsEndIndex(),
                        CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
          BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, S);
+          newTreeEntry(VL, false, UserTreeIdx);
          DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
                       << *VL[i] << '\n');
          return;
        }
      }
 
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
      for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
@@ -1922,7 +1855,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
          CallInst *CI2 = dyn_cast<CallInst>(j);
          Operands.push_back(CI2->getArgOperand(i));
        }
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
      }
      return;
    }
@@ -1931,11 +1864,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       // then do not vectorize this instruction.
       if (!S.IsAltShuffle) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx, S);
+        newTreeEntry(VL, false, UserTreeIdx);
         DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
         return;
       }
-      newTreeEntry(VL, true, UserTreeIdx, S);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
       // Reorder operands if reordering would enable vectorization.
@@ -1943,7 +1876,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         ValueList Left, Right;
         reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
         buildTree_rec(Left, Depth + 1, UserTreeIdx);
-        buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);
+        buildTree_rec(Right, Depth + 1, UserTreeIdx);
         return;
       }
 
@@ -1953,13 +1886,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
 
     default:
       BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, false, UserTreeIdx, S);
+      newTreeEntry(VL, false, UserTreeIdx);
       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
       return;
   }
@@ -2797,20 +2730,12 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
   return nullptr;
 }
 
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int OpdNum, int UserIndx) {
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
   InstructionsState S = getSameOpcode(VL);
   if (S.Opcode) {
     if (TreeEntry *E = getTreeEntry(S.OpValue)) {
-      TreeEntry *UserTreeEntry = nullptr;
-      if (UserIndx != -1)
-        UserTreeEntry = &VectorizableTree[UserIndx];
-
-      if (E->isSame(VL) ||
-          (UserTreeEntry &&
-           (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() &&
-           !UserTreeEntry->ShuffleMask[OpdNum].empty() &&
-           E->isFoundJumbled(VL, *DL, *SE)))
-        return vectorizeTree(E, OpdNum, UserIndx);
+      if (E->isSame(VL))
+        return vectorizeTree(E);
     }
   }
 
@@ -2822,10 +2747,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int OpdNum, int UserIndx) {
   return Gather(VL, VecTy);
 }
 
-Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
+Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   IRBuilder<>::InsertPointGuard Guard(Builder);
 
-  TreeEntry *UserTreeEntry = nullptr;
   if (E->VectorizedValue) {
     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
     return E->VectorizedValue;
@@ -2845,10 +2769,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
     return V;
   }
 
-  assert(ScalarToTreeEntry.count(E->Scalars[0]) &&
-         "Expected user tree entry, missing!");
-  int CurrIndx = ScalarToTreeEntry[E->Scalars[0]];
-
   unsigned ShuffleOrOp = S.IsAltShuffle ?
           (unsigned) Instruction::ShuffleVector : S.Opcode;
   switch (ShuffleOrOp) {
@@ -2878,7 +2798,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
         Builder.SetInsertPoint(IBB->getTerminator());
         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
-        Value *Vec = vectorizeTree(Operands, i, CurrIndx);
+        Value *Vec = vectorizeTree(Operands);
         NewPhi->addIncoming(Vec, IBB);
       }
 
@@ -2931,7 +2851,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
 
       setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *InVec = vectorizeTree(INVL, 0, CurrIndx);
+      Value *InVec = vectorizeTree(INVL);
 
       if (Value *V = alreadyVectorized(E->Scalars, VL0))
         return V;
@@ -2952,8 +2872,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
 
      setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *L = vectorizeTree(LHSV, 0, CurrIndx);
-      Value *R = vectorizeTree(RHSV, 1, CurrIndx);
+      Value *L = vectorizeTree(LHSV);
+      Value *R = vectorizeTree(RHSV);
 
       if (Value *V = alreadyVectorized(E->Scalars, VL0))
         return V;
@@ -2980,9 +2900,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
 
      setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *Cond = vectorizeTree(CondVec, 0, CurrIndx);
-      Value *True = vectorizeTree(TrueVec, 1, CurrIndx);
-      Value *False = vectorizeTree(FalseVec, 2, CurrIndx);
+      Value *Cond = vectorizeTree(CondVec);
+      Value *True = vectorizeTree(TrueVec);
+      Value *False = vectorizeTree(FalseVec);
 
       if (Value *V = alreadyVectorized(E->Scalars, VL0))
        return V;
@@ -3023,8 +2943,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
 
      setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx);
-      Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);
+      Value *LHS = vectorizeTree(LHSVL);
+      Value *RHS = vectorizeTree(RHSVL);
 
      if (Value *V = alreadyVectorized(E->Scalars, VL0))
        return V;
@@ -3045,20 +2965,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
       // sink them all the way down past store instructions.
      setInsertPointAfterBundle(E->Scalars, VL0);
 
-      if (UserIndx != -1)
-        UserTreeEntry = &VectorizableTree[UserIndx];
-
-      bool isJumbled = false;
-      LoadInst *LI = NULL;
-      if (UserTreeEntry &&
-          (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() &&
-          !UserTreeEntry->ShuffleMask[OpdNum].empty()) {
-        isJumbled = true;
-        LI = cast<LoadInst>(E->Scalars[0]);
-      } else {
-        LI = cast<LoadInst>(VL0);
-      }
-
+      LoadInst *LI = cast<LoadInst>(VL0);
       Type *ScalarLoadTy = LI->getType();
       unsigned AS = LI->getPointerAddressSpace();
@@ -3080,21 +2987,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
       LI->setAlignment(Alignment);
       E->VectorizedValue = LI;
       ++NumVectorInstructions;
-      propagateMetadata(LI, E->Scalars);
-
-      if (isJumbled) {
-        SmallVector<Constant *, 8> Mask;
-        for (unsigned LaneEntry : UserTreeEntry->ShuffleMask[OpdNum])
-          Mask.push_back(Builder.getInt32(LaneEntry));
-        // Generate shuffle for jumbled memory access
-        Value *Undef = UndefValue::get(VecTy);
-        Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef,
-                                                  ConstantVector::get(Mask));
-        E->VectorizedValue = Shuf;
-        ++NumVectorInstructions;
-        return Shuf;
-      }
-      return LI;
+      return propagateMetadata(LI, E->Scalars);
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(VL0);
@@ -3107,7 +3000,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
 
      setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *VecValue = vectorizeTree(ScalarStoreValues, 0, CurrIndx);
+      Value *VecValue = vectorizeTree(ScalarStoreValues);
       Value *ScalarPtr = SI->getPointerOperand();
       Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
@@ -3133,7 +3026,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
       for (Value *V : E->Scalars)
         Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
 
-      Value *Op0 = vectorizeTree(Op0VL, 0, CurrIndx);
+      Value *Op0 = vectorizeTree(Op0VL);
 
       std::vector<Value *> OpVecs;
       for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
@@ -3142,7 +3035,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
        for (Value *V : E->Scalars)
          OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
 
-        Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);
+        Value *OpVec = vectorizeTree(OpVL);
        OpVecs.push_back(OpVec);
      }
 
@@ -3181,7 +3074,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
          OpVL.push_back(CEI->getArgOperand(j));
        }
 
-        Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);
+        Value *OpVec = vectorizeTree(OpVL);
        DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
        OpVecs.push_back(OpVec);
      }
@@ -3212,8 +3105,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
      reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
      setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx);
-      Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);
+      Value *LHS = vectorizeTree(LHSVL);
+      Value *RHS = vectorizeTree(RHSVL);
 
      if (Value *V = alreadyVectorized(E->Scalars, VL0))
        return V;
@@ -3313,14 +3206,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
       continue;
     TreeEntry *E = getTreeEntry(Scalar);
     assert(E && "Invalid scalar");
-    assert((!E->NeedToGather) && "Extracting from a gather list");
+    assert(!E->NeedToGather && "Extracting from a gather list");
 
-    Value *Vec = dyn_cast<ShuffleVectorInst>(E->VectorizedValue);
-    if (Vec && dyn_cast<LoadInst>(cast<Instruction>(Vec)->getOperand(0))) {
-      Vec = cast<Instruction>(E->VectorizedValue)->getOperand(0);
-    } else {
-      Vec = E->VectorizedValue;
-    }
+    Value *Vec = E->VectorizedValue;
     assert(Vec && "Can't find vectorizable value");
 
     Value *Lane = Builder.getInt32(ExternalUse.Lane);
@@ -4017,6 +3905,7 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
   // seed additional demotion, we save the truncated value.
   case Instruction::Trunc:
     Roots.push_back(I->getOperand(0));
+    break;
  case Instruction::ZExt:
  case Instruction::SExt:
    break;
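
For context, the SLPVectorizer hunks above revert the "jumbled load" handling: the removed TreeEntry::ShuffleMask recorded, per operand, how the lanes of a wide load had to be permuted so that loads which are consecutive in memory but consumed out of order could still be vectorized (the reverted code emitted the permutation as a shufflevector on the wide load via Builder.CreateShuffleVector). The standalone C++ sketch below only illustrates that lane-permutation idea; applyShuffleMask, the sample values, and the mask are hypothetical and are not LLVM API.

// Minimal sketch (assumed names, not part of the patch): Out[i] = Loaded[Mask[i]],
// the same lane permutation the removed ShuffleMask entry described.
#include <cstdio>
#include <vector>

static std::vector<int> applyShuffleMask(const std::vector<int> &Loaded,
                                         const std::vector<unsigned> &Mask) {
  std::vector<int> Out(Mask.size());
  for (unsigned Lane = 0; Lane < Mask.size(); ++Lane)
    Out[Lane] = Loaded[Mask[Lane]]; // pick the source lane for each result lane
  return Out;
}

int main() {
  // Values as they sit consecutively in memory: a[0], a[1], a[2], a[3].
  std::vector<int> Loaded = {10, 11, 12, 13};
  // The scalar code consumed them jumbled as a[2], a[0], a[3], a[1].
  std::vector<unsigned> Mask = {2, 0, 3, 1};
  for (int V : applyShuffleMask(Loaded, Mask))
    std::printf("%d ", V); // prints: 12 10 13 11
  std::printf("\n");
  return 0;
}

With the revert, such bundles are simply gathered again (only consecutive and reversed-consecutive orders are recognized), as the added "Gathering non-consecutive loads" path shows.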
