path: root/lib/Transforms
Diffstat (limited to 'lib/Transforms')
-rw-r--r--  lib/Transforms/Coroutines/CoroInstr.h                        44
-rw-r--r--  lib/Transforms/IPO/PassManagerBuilder.cpp                    62
-rw-r--r--  lib/Transforms/IPO/SampleProfile.cpp                          3
-rw-r--r--  lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp                   2
-rw-r--r--  lib/Transforms/InstCombine/InstCombineAndOrXor.cpp           21
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCalls.cpp               2
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCompares.cpp           49
-rw-r--r--  lib/Transforms/InstCombine/InstCombineInternal.h              5
-rw-r--r--  lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp    24
-rw-r--r--  lib/Transforms/InstCombine/InstCombineMulDivRem.cpp           4
-rw-r--r--  lib/Transforms/InstCombine/InstCombineSelect.cpp             17
-rw-r--r--  lib/Transforms/InstCombine/InstructionCombining.cpp           8
-rw-r--r--  lib/Transforms/Scalar/ConstantHoisting.cpp                   95
-rw-r--r--  lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp      6
-rw-r--r--  lib/Transforms/Scalar/LoopUnrollPass.cpp                     30
-rw-r--r--  lib/Transforms/Scalar/NewGVN.cpp                              3
-rw-r--r--  lib/Transforms/Scalar/Reassociate.cpp                         2
-rw-r--r--  lib/Transforms/Scalar/RewriteStatepointsForGC.cpp             2
-rw-r--r--  lib/Transforms/Scalar/SROA.cpp                               20
-rw-r--r--  lib/Transforms/Utils/CodeExtractor.cpp                       27
-rw-r--r--  lib/Transforms/Utils/LoopUnrollRuntime.cpp                  130
-rw-r--r--  lib/Transforms/Utils/LoopUtils.cpp                           19
-rw-r--r--  lib/Transforms/Utils/LowerMemIntrinsics.cpp                  19
-rw-r--r--  lib/Transforms/Utils/OrderedInstructions.cpp                  3
-rw-r--r--  lib/Transforms/Utils/PredicateInfo.cpp                       79
-rw-r--r--  lib/Transforms/Utils/SimplifyCFG.cpp                         10
-rw-r--r--  lib/Transforms/Utils/SimplifyIndVar.cpp                      10
-rw-r--r--  lib/Transforms/Vectorize/BBVectorize.cpp                   3282
-rw-r--r--  lib/Transforms/Vectorize/CMakeLists.txt                       1
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp                  675
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp                   84
-rw-r--r--  lib/Transforms/Vectorize/Vectorize.cpp                        3
32 files changed, 820 insertions, 3921 deletions
diff --git a/lib/Transforms/Coroutines/CoroInstr.h b/lib/Transforms/Coroutines/CoroInstr.h
index 5c666bdfea1f..9a8cc5a2591c 100644
--- a/lib/Transforms/Coroutines/CoroInstr.h
+++ b/lib/Transforms/Coroutines/CoroInstr.h
@@ -58,10 +58,10 @@ public:
}
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_subfn_addr;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -70,10 +70,10 @@ public:
class LLVM_LIBRARY_VISIBILITY CoroAllocInst : public IntrinsicInst {
public:
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_alloc;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -175,10 +175,10 @@ public:
}
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_id;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -187,10 +187,10 @@ public:
class LLVM_LIBRARY_VISIBILITY CoroFrameInst : public IntrinsicInst {
public:
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_frame;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -203,10 +203,10 @@ public:
Value *getFrame() const { return getArgOperand(FrameArg); }
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_free;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -221,10 +221,10 @@ public:
Value *getMem() const { return getArgOperand(MemArg); }
  // Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_begin;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -233,10 +233,10 @@ public:
class LLVM_LIBRARY_VISIBILITY CoroSaveInst : public IntrinsicInst {
public:
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_save;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -254,10 +254,10 @@ public:
}
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_promise;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -279,10 +279,10 @@ public:
}
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_suspend;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -291,10 +291,10 @@ public:
class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst {
public:
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_size;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -310,10 +310,10 @@ public:
}
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_end;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
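The change here is purely mechanical: a member function defined inside a class body is implicitly inline, so the explicit `inline` keyword on these classof definitions was redundant. A minimal standalone sketch of the same isa/cast/dyn_cast support idiom, with hypothetical Animal/Dog types standing in for the coroutine intrinsic classes:

    #include "llvm/Support/Casting.h"

    struct Animal {
      enum Kind { DogKind, CatKind };
      const Kind K;
      Animal(Kind K) : K(K) {}
    };

    struct Dog : Animal {
      Dog() : Animal(DogKind) {}
      // Defined in-class, hence implicitly inline; no keyword needed.
      static bool classof(const Animal *A) { return A->K == DogKind; }
    };

    bool isDog(const Animal *A) { return llvm::isa<Dog>(A); }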
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 087a8aa2c624..5b1b58b89c32 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -56,10 +56,6 @@ RunSLPVectorization("vectorize-slp", cl::Hidden,
cl::desc("Run the SLP vectorization passes"));
static cl::opt<bool>
-RunBBVectorization("vectorize-slp-aggressive", cl::Hidden,
- cl::desc("Run the BB vectorization passes"));
-
-static cl::opt<bool>
UseGVNAfterVectorization("use-gvn-after-vectorization",
cl::init(false), cl::Hidden,
cl::desc("Run GVN instead of Early CSE after vectorization passes"));
@@ -138,8 +134,8 @@ static cl::opt<int> PreInlineThreshold(
"(default = 75)"));
static cl::opt<bool> EnableEarlyCSEMemSSA(
- "enable-earlycse-memssa", cl::init(false), cl::Hidden,
- cl::desc("Enable the EarlyCSE w/ MemorySSA pass (default = off)"));
+ "enable-earlycse-memssa", cl::init(true), cl::Hidden,
+ cl::desc("Enable the EarlyCSE w/ MemorySSA pass (default = on)"));
static cl::opt<bool> EnableGVNHoist(
"enable-gvn-hoist", cl::init(false), cl::Hidden,
@@ -166,7 +162,6 @@ PassManagerBuilder::PassManagerBuilder() {
Inliner = nullptr;
DisableUnitAtATime = false;
DisableUnrollLoops = false;
- BBVectorize = RunBBVectorization;
SLPVectorize = RunSLPVectorization;
LoopVectorize = RunLoopVectorization;
RerollLoops = RunLoopRerolling;
@@ -263,11 +258,12 @@ void PassManagerBuilder::populateFunctionPassManager(
// Do PGO instrumentation generation or use pass as the option specified.
void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) {
- if (!EnablePGOInstrGen && PGOInstrUse.empty())
+ if (!EnablePGOInstrGen && PGOInstrUse.empty() && PGOSampleUse.empty())
return;
// Perform the preinline and cleanup passes for O1 and above.
// And avoid doing them if optimizing for size.
- if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner) {
+ if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner &&
+ PGOSampleUse.empty()) {
// Create preinline pass. We construct an InlineParams object and specify
// the threshold here to avoid the command line options of the regular
// inliner to influence pre-inlining. The only fields of InlineParams we
@@ -383,26 +379,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
if (RerollLoops)
MPM.add(createLoopRerollPass());
- if (!RunSLPAfterLoopVectorization) {
- if (SLPVectorize)
- MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
-
- if (BBVectorize) {
- MPM.add(createBBVectorizePass());
- addInstructionCombiningPass(MPM);
- addExtensionsToPM(EP_Peephole, MPM);
- if (OptLevel > 1 && UseGVNAfterVectorization)
- MPM.add(NewGVN
- ? createNewGVNPass()
- : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
- else
- MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
-
- // BBVectorize may have significantly shortened a loop body; unroll again.
- if (!DisableUnrollLoops)
- MPM.add(createLoopUnrollPass(OptLevel));
- }
- }
+ if (!RunSLPAfterLoopVectorization && SLPVectorize)
+ MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
MPM.add(createAggressiveDCEPass()); // Delete dead instructions
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
@@ -634,28 +612,10 @@ void PassManagerBuilder::populateModulePassManager(
addInstructionCombiningPass(MPM);
}
- if (RunSLPAfterLoopVectorization) {
- if (SLPVectorize) {
- MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
- if (OptLevel > 1 && ExtraVectorizerPasses) {
- MPM.add(createEarlyCSEPass());
- }
- }
-
- if (BBVectorize) {
- MPM.add(createBBVectorizePass());
- addInstructionCombiningPass(MPM);
- addExtensionsToPM(EP_Peephole, MPM);
- if (OptLevel > 1 && UseGVNAfterVectorization)
- MPM.add(NewGVN
- ? createNewGVNPass()
- : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
- else
- MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
-
- // BBVectorize may have significantly shortened a loop body; unroll again.
- if (!DisableUnrollLoops)
- MPM.add(createLoopUnrollPass(OptLevel));
+ if (RunSLPAfterLoopVectorization && SLPVectorize) {
+ MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+ if (OptLevel > 1 && ExtraVectorizerPasses) {
+ MPM.add(createEarlyCSEPass());
}
}
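With BBVectorize removed, SLP and loop vectorization are the only vectorizer toggles left on PassManagerBuilder. A hedged sketch (not part of this patch) of how a frontend might configure the builder after this change:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"

    void buildPipeline(llvm::legacy::PassManager &MPM) {
      llvm::PassManagerBuilder PMB;
      PMB.OptLevel = 2;
      PMB.SLPVectorize = true;   // the BBVectorize field no longer exists
      PMB.LoopVectorize = true;
      PMB.populateModulePassManager(MPM);
    }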
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index 656421ee58df..ac4765f96075 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -1484,7 +1484,8 @@ bool SampleProfileLoader::runOnFunction(Function &F) {
PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
ModuleAnalysisManager &AM) {
- SampleProfileLoader SampleLoader(SampleProfileFile);
+ SampleProfileLoader SampleLoader(
+ ProfileFileName.empty() ? SampleProfileFile : ProfileFileName);
SampleLoader.doInitialization(M);
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 802f470ffe1f..8d494fe9cde2 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -371,6 +371,7 @@ void splitAndWriteThinLTOBitcode(
/*GenerateHash=*/true, &ModHash);
W.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false,
&MergedMIndex);
+ W.writeSymtab();
W.writeStrtab();
OS << Buffer;
@@ -385,6 +386,7 @@ void splitAndWriteThinLTOBitcode(
/*GenerateHash=*/false, &ModHash);
W2.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false,
&MergedMIndex);
+ W2.writeSymtab();
W2.writeStrtab();
*ThinLinkOS << Buffer;
}
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index d3d8cefe9735..db98be2c98f5 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2301,10 +2301,10 @@ static Instruction *foldXorToXor(BinaryOperator &I) {
// (~B | A) ^ (~A | B) -> A ^ B
// (~A | B) ^ (A | ~B) -> A ^ B
// (B | ~A) ^ (A | ~B) -> A ^ B
- if ((match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_Or(m_Not(m_Specific(A)), m_Specific(B)))) ||
- (match(Op0, m_c_Or(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1, m_Or(m_Specific(A), m_Not(m_Specific(B)))))) {
+ if ((match(Op0, m_Or(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) ||
+ (match(Op0, m_Or(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Not(m_Specific(B)))))) {
I.setOperand(0, A);
I.setOperand(1, B);
return &I;
@@ -2314,10 +2314,10 @@ static Instruction *foldXorToXor(BinaryOperator &I) {
// (~B & A) ^ (~A & B) -> A ^ B
// (~A & B) ^ (A & ~B) -> A ^ B
// (B & ~A) ^ (A & ~B) -> A ^ B
- if ((match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_And(m_Not(m_Specific(A)), m_Specific(B)))) ||
- (match(Op0, m_c_And(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1, m_And(m_Specific(A), m_Not(m_Specific(B)))))) {
+ if ((match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))) ||
+ (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))))) {
I.setOperand(0, A);
I.setOperand(1, B);
return &I;
@@ -2456,10 +2456,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
- // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
+ // not (cmp A, B) = !cmp A, B
ICmpInst::Predicate Pred;
- if (match(Op0, m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))) &&
- match(Op1, m_AllOnes())) {
+ if (match(&I, m_Not(m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))))) {
cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred));
return replaceInstUsesWith(I, Op0);
}
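The matcher rearrangement above still canonicalizes the same or/and forms to a plain xor; the underlying bitwise identities are easy to verify exhaustively. A small standalone check, illustrative only:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t A = 0; A < 256; ++A)
        for (uint32_t B = 0; B < 256; ++B) {
          // (A | ~B) ^ (~A | B) == A ^ B, in every commuted form.
          assert(((A | ~B) ^ (~A | B)) == (A ^ B));
          // (A & ~B) ^ (~A & B) == A ^ B, the classic xor decomposition.
          assert(((A & ~B) ^ (~A & B)) == (A ^ B));
        }
      return 0;
    }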
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index dbed7ad4eae8..3770021de100 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1985,7 +1985,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *X = nullptr;
// bitreverse(bitreverse(x)) -> x
- if (match(IIOperand, m_Intrinsic<Intrinsic::bitreverse>(m_Value(X))))
+ if (match(IIOperand, m_BitReverse(m_Value(X))))
return replaceInstUsesWith(CI, X);
break;
}
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 6ad32490a328..58b8b2f52629 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -112,10 +112,10 @@ static bool subWithOverflow(Constant *&Result, Constant *In1,
/// Given an icmp instruction, return true if any use of this comparison is a
/// branch on sign bit comparison.
-static bool isBranchOnSignBitCheck(ICmpInst &I, bool isSignBit) {
+static bool hasBranchUse(ICmpInst &I) {
for (auto *U : I.users())
if (isa<BranchInst>(U))
- return isSignBit;
+ return true;
return false;
}
@@ -1448,12 +1448,13 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
// of a test and branch. So we avoid canonicalizing in such situations
// because test and branch instruction has better branch displacement
// than compare and branch instruction.
- if (!isBranchOnSignBitCheck(Cmp, IsSignBit) && !Cmp.isEquality()) {
- if (auto *AI = Intersection.getSingleElement())
- return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder->getInt(*AI));
- if (auto *AD = Difference.getSingleElement())
- return new ICmpInst(ICmpInst::ICMP_NE, X, Builder->getInt(*AD));
- }
+ if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp)))
+ return nullptr;
+
+ if (auto *AI = Intersection.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder->getInt(*AI));
+ if (auto *AD = Difference.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_NE, X, Builder->getInt(*AD));
}
return nullptr;
@@ -3301,12 +3302,12 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
return nullptr;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ const CmpInst::Predicate Pred = I.getPredicate();
Value *A, *B, *C, *D;
if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0
Value *OtherVal = A == Op1 ? B : A;
- return new ICmpInst(I.getPredicate(), OtherVal,
- Constant::getNullValue(A->getType()));
+ return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
}
if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
@@ -3316,26 +3317,25 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
Op1->hasOneUse()) {
Constant *NC = Builder->getInt(C1->getValue() ^ C2->getValue());
Value *Xor = Builder->CreateXor(C, NC);
- return new ICmpInst(I.getPredicate(), A, Xor);
+ return new ICmpInst(Pred, A, Xor);
}
// A^B == A^D -> B == D
if (A == C)
- return new ICmpInst(I.getPredicate(), B, D);
+ return new ICmpInst(Pred, B, D);
if (A == D)
- return new ICmpInst(I.getPredicate(), B, C);
+ return new ICmpInst(Pred, B, C);
if (B == C)
- return new ICmpInst(I.getPredicate(), A, D);
+ return new ICmpInst(Pred, A, D);
if (B == D)
- return new ICmpInst(I.getPredicate(), A, C);
+ return new ICmpInst(Pred, A, C);
}
}
if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) {
// A == (A^B) -> B == 0
Value *OtherVal = A == Op0 ? B : A;
- return new ICmpInst(I.getPredicate(), OtherVal,
- Constant::getNullValue(A->getType()));
+ return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
}
// (X&Z) == (Y&Z) -> (X^Y) & Z == 0
@@ -3380,8 +3380,7 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
APInt Pow2 = Cst1->getValue() + 1;
if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
- return new ICmpInst(I.getPredicate(), A,
- Builder->CreateTrunc(B, A->getType()));
+ return new ICmpInst(Pred, A, Builder->CreateTrunc(B, A->getType()));
}
// (A >> C) == (B >> C) --> (A^B) u< (1 << C)
@@ -3393,12 +3392,11 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
unsigned TypeBits = Cst1->getBitWidth();
unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
if (ShAmt < TypeBits && ShAmt != 0) {
- ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_NE
- ? ICmpInst::ICMP_UGE
- : ICmpInst::ICMP_ULT;
+ ICmpInst::Predicate NewPred =
+ Pred == ICmpInst::ICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted");
APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
- return new ICmpInst(Pred, Xor, Builder->getInt(CmpVal));
+ return new ICmpInst(NewPred, Xor, Builder->getInt(CmpVal));
}
}
@@ -3412,8 +3410,7 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt);
Value *And = Builder->CreateAnd(Xor, Builder->getInt(AndVal),
I.getName() + ".mask");
- return new ICmpInst(I.getPredicate(), And,
- Constant::getNullValue(Cst1->getType()));
+ return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType()));
}
}
@@ -3437,7 +3434,7 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
CmpV <<= ShAmt;
Value *Mask = Builder->CreateAnd(A, Builder->getInt(MaskV));
- return new ICmpInst(I.getPredicate(), Mask, Builder->getInt(CmpV));
+ return new ICmpInst(Pred, Mask, Builder->getInt(CmpV));
}
}
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 1b0fe84dd4dd..87f11467b95e 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -131,11 +131,10 @@ static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) {
return true;
// A vector of constant integers can be inverted easily.
- Constant *CV;
- if (V->getType()->isVectorTy() && match(V, PatternMatch::m_Constant(CV))) {
+ if (V->getType()->isVectorTy() && isa<Constant>(V)) {
unsigned NumElts = V->getType()->getVectorNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
- Constant *Elt = CV->getAggregateElement(i);
+ Constant *Elt = cast<Constant>(V)->getAggregateElement(i);
if (!Elt)
return false;
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index ca370c73fca4..26bee204e5a4 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -661,6 +661,9 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
if (NumElements == 1) {
LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U),
".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ NewLoad->setAAMetadata(AAMD);
return IC.replaceInstUsesWith(LI, IC.Builder->CreateInsertValue(
UndefValue::get(T), NewLoad, 0, Name));
}
@@ -690,6 +693,10 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
Name + ".elt");
auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
auto *L = IC.Builder->CreateAlignedLoad(Ptr, EltAlign, Name + ".unpack");
+ // Propagate AA metadata. It'll still be valid on the narrowed load.
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ L->setAAMetadata(AAMD);
V = IC.Builder->CreateInsertValue(V, L, i);
}
@@ -702,6 +709,9 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
auto NumElements = AT->getNumElements();
if (NumElements == 1) {
LoadInst *NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ NewLoad->setAAMetadata(AAMD);
return IC.replaceInstUsesWith(LI, IC.Builder->CreateInsertValue(
UndefValue::get(T), NewLoad, 0, Name));
}
@@ -734,6 +744,9 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
Name + ".elt");
auto *L = IC.Builder->CreateAlignedLoad(Ptr, MinAlign(Align, Offset),
Name + ".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ L->setAAMetadata(AAMD);
V = IC.Builder->CreateInsertValue(V, L, i);
Offset += EltSize;
}
@@ -1192,7 +1205,11 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
AddrName);
auto *Val = IC.Builder->CreateExtractValue(V, i, EltName);
auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
- IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign);
+ llvm::Instruction *NS =
+ IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign);
+ AAMDNodes AAMD;
+ SI.getAAMetadata(AAMD);
+ NS->setAAMetadata(AAMD);
}
return true;
@@ -1239,7 +1256,10 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
AddrName);
auto *Val = IC.Builder->CreateExtractValue(V, i, EltName);
auto EltAlign = MinAlign(Align, Offset);
- IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign);
+ Instruction *NS = IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign);
+ AAMDNodes AAMD;
+ SI.getAAMetadata(AAMD);
+ NS->setAAMetadata(AAMD);
Offset += EltSize;
}
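The same three-line propagation pattern recurs in every hunk above. A hedged sketch of it as a standalone helper (propagateAAMetadata is a hypothetical name, not part of the patch):

    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/Metadata.h"

    // Aliasing facts (!tbaa, !alias.scope, !noalias) proven for a wide
    // aggregate access remain valid for each narrowed element access.
    static void propagateAAMetadata(const llvm::Instruction &OrigMemOp,
                                    llvm::Instruction *NewMemOp) {
      llvm::AAMDNodes AAMD;
      OrigMemOp.getAAMetadata(AAMD);  // harvest metadata from the wide op
      NewMemOp->setAAMetadata(AAMD);  // reattach it to the narrowed access
    }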
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 365c4ba75154..579639a6194e 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -227,8 +227,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (I.hasNoUnsignedWrap())
Shl->setHasNoUnsignedWrap();
if (I.hasNoSignedWrap()) {
- uint64_t V;
- if (match(NewCst, m_ConstantInt(V)) && V != Width - 1)
+ const APInt *V;
+ if (match(NewCst, m_APInt(V)) && *V != Width - 1)
Shl->setHasNoSignedWrap();
}
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 33951e66497a..80c6595904e1 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1167,6 +1167,23 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
if (Instruction *I = canonicalizeSelectToShuffle(SI))
return I;
+ // Canonicalize a one-use integer compare with a non-canonical predicate by
+ // inverting the predicate and swapping the select operands. This matches a
+ // compare canonicalization for conditional branches.
+ // TODO: Should we do the same for FP compares?
+ CmpInst::Predicate Pred;
+ if (match(CondVal, m_OneUse(m_ICmp(Pred, m_Value(), m_Value()))) &&
+ !isCanonicalPredicate(Pred)) {
+ // Swap true/false values and condition.
+ CmpInst *Cond = cast<CmpInst>(CondVal);
+ Cond->setPredicate(CmpInst::getInversePredicate(Pred));
+ SI.setOperand(1, FalseVal);
+ SI.setOperand(2, TrueVal);
+ SI.swapProfMetadata();
+ Worklist.Add(Cond);
+ return &SI;
+ }
+
if (SelType->getScalarType()->isIntegerTy(1) &&
TrueVal->getType() == CondVal->getType()) {
if (match(TrueVal, m_One())) {
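An illustrative before/after for the new canonicalization, sketched as IR and assuming ule is among the non-canonical predicates:

    before:  %c = icmp ule i32 %x, %y           ; non-canonical, one use
             %s = select i1 %c, i32 %a, i32 %b
    after:   %c = icmp ugt i32 %x, %y           ; inverted predicate
             %s = select i1 %c, i32 %b, i32 %a  ; arms and prof data swapped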
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 02fac4fb37a4..723414635d6f 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2425,9 +2425,15 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
Builder->SetInsertPoint(L);
Value *GEP = Builder->CreateInBoundsGEP(L->getType(),
L->getPointerOperand(), Indices);
+ Instruction *NL = Builder->CreateLoad(GEP);
+      // Whatever aliasing information we had for the original load must also
+ // hold for the smaller load, so propagate the annotations.
+ AAMDNodes Nodes;
+ L->getAAMetadata(Nodes);
+ NL->setAAMetadata(Nodes);
// Returning the load directly will cause the main loop to insert it in
// the wrong spot, so use replaceInstUsesWith().
- return replaceInstUsesWith(EV, Builder->CreateLoad(GEP));
+ return replaceInstUsesWith(EV, NL);
}
// We could simplify extracts from other values. Note that nested extracts may
// already be simplified implicitly by the above: extract (extract (insert) )
diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
index c3810366bf22..a49c9b68c97d 100644
--- a/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -38,6 +38,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
@@ -340,6 +341,49 @@ void ConstantHoistingPass::collectConstantCandidates(
}
}
+
+/// \brief Check the operand for instruction Inst at index Idx.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
+ Value *Opnd = Inst->getOperand(Idx);
+
+ // Visit constant integers.
+ if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+
+ // Visit cast instructions that have constant integers.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+ // Only visit cast instructions, which have been skipped. All other
+ // instructions should have already been visited.
+ if (!CastInst->isCast())
+ return;
+
+ if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the cast instruction.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+ }
+
+ // Visit constant expressions that have constant integers.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ // Only visit constant cast expressions.
+ if (!ConstExpr->isCast())
+ return;
+
+ if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the constant expression.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+ }
+}
+
+
/// \brief Scan the instruction for expensive integer constants and record them
/// in the constant candidate vector.
void ConstantHoistingPass::collectConstantCandidates(
@@ -365,44 +409,25 @@ void ConstantHoistingPass::collectConstantCandidates(
if (AI && AI->isStaticAlloca())
return;
- // Scan all operands.
- for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
- Value *Opnd = Inst->getOperand(Idx);
-
- // Visit constant integers.
- if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- continue;
- }
-
- // Visit cast instructions that have constant integers.
- if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
- // Only visit cast instructions, which have been skipped. All other
- // instructions should have already been visited.
- if (!CastInst->isCast())
- continue;
-
- if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
- // Pretend the constant is directly used by the instruction and ignore
- // the cast instruction.
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- continue;
+ // Constants in GEPs that index into a struct type should not be hoisted.
+ if (isa<GetElementPtrInst>(Inst)) {
+ gep_type_iterator GTI = gep_type_begin(Inst);
+
+ // Collect constant for first operand.
+ collectConstantCandidates(ConstCandMap, Inst, 0);
+    // Scan the remaining operands.
+ for (unsigned Idx = 1, E = Inst->getNumOperands(); Idx != E; ++Idx, ++GTI) {
+      // Only collect constants that index into a non-struct type.
+ if (!GTI.isStruct()) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx);
}
}
+ return;
+ }
- // Visit constant expressions that have constant integers.
- if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
- // Only visit constant cast expressions.
- if (!ConstExpr->isCast())
- continue;
-
- if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
- // Pretend the constant is directly used by the instruction and ignore
- // the constant expression.
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- continue;
- }
- }
+ // Scan all operands.
+ for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx);
} // end of for all operands
}
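Struct-field selectors in a GEP are part of the type computation and must stay literal constants, so hoisting them would produce invalid IR; only array/vector/pointer indices are candidates. A hedged sketch of the gep_type_iterator idiom the new code relies on:

    #include "llvm/IR/GetElementPtrTypeIterator.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static void visitHoistableIndices(GetElementPtrInst *GEP) {
      gep_type_iterator GTI = gep_type_begin(GEP);
      // Operand 0 is the base pointer; the indices start at operand 1.
      for (unsigned Idx = 1, E = GEP->getNumOperands(); Idx != E;
           ++Idx, ++GTI) {
        if (GTI.isStruct())
          continue; // a field selector: must remain a literal ConstantInt
        // ...operand Idx indexes sequential memory and may be hoisted...
      }
    }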
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 2f96c3064b86..a40c22c3fce9 100644
--- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -917,7 +917,6 @@ LoopConstrainer::calculateSubRanges() const {
// I think we can be more aggressive here and make this nuw / nsw if the
// addition that feeds into the icmp for the latch's terminating branch is nuw
// / nsw. In any case, a wrapping 2's complement addition is safe.
- ConstantInt *One = ConstantInt::get(Ty, 1);
const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart);
const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt);
@@ -948,8 +947,9 @@ LoopConstrainer::calculateSubRanges() const {
// will be an empty range. Returning an empty range is always safe.
//
- Smallest = SE.getAddExpr(End, SE.getSCEV(One));
- Greatest = SE.getAddExpr(Start, SE.getSCEV(One));
+ const SCEV *One = SE.getOne(Ty);
+ Smallest = SE.getAddExpr(End, One);
+ Greatest = SE.getAddExpr(Start, One);
}
auto Clamp = [this, Smallest, Greatest](const SCEV *S) {
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 62aa6ee48069..530a68424d5c 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -131,7 +131,7 @@ static const unsigned NoThreshold = UINT_MAX;
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
- Loop *L, const TargetTransformInfo &TTI, int OptLevel,
+ Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
Optional<bool> UserUpperBound) {
@@ -158,7 +158,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.AllowPeeling = true;
// Override with any target specific settings
- TTI.getUnrollingPreferences(L, UP);
+ TTI.getUnrollingPreferences(L, SE, UP);
// Apply size attributes
if (L->getHeader()->getParent()->optForSize()) {
@@ -699,7 +699,7 @@ static uint64_t getUnrolledLoopSize(
// Calculates unroll count and writes it to UP.Count.
static bool computeUnrollCount(
Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
+ ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
// Check for explicit Count.
@@ -770,7 +770,7 @@ static bool computeUnrollCount(
// helps to remove a significant number of instructions.
// To check that, run additional analysis on the loop.
if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
- L, FullUnrollTripCount, DT, *SE, TTI,
+ L, FullUnrollTripCount, DT, SE, TTI,
UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
unsigned Boost =
getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
@@ -836,6 +836,8 @@ static bool computeUnrollCount(
} else {
UP.Count = TripCount;
}
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
UP.Count != TripCount)
ORE->emit(
@@ -926,7 +928,7 @@ static bool computeUnrollCount(
}
static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution *SE, const TargetTransformInfo &TTI,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI,
AssumptionCache &AC, OptimizationRemarkEmitter &ORE,
bool PreserveLCSSA, int OptLevel,
Optional<unsigned> ProvidedCount,
@@ -948,8 +950,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
bool NotDuplicatable;
bool Convergent;
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, TTI, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
- ProvidedRuntime, ProvidedUpperBound);
+ L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount,
+ ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound);
// Exit early if unrolling is disabled.
if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
return false;
@@ -977,8 +979,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
ExitingBlock = L->getExitingBlock();
if (ExitingBlock) {
- TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
- TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
+ TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
+ TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
}
// If the loop contains a convergent operation, the prelude we'd add
@@ -1000,8 +1002,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
// count.
bool MaxOrZero = false;
if (!TripCount) {
- MaxTripCount = SE->getSmallConstantMaxTripCount(L);
- MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L);
+ MaxTripCount = SE.getSmallConstantMaxTripCount(L);
+ MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
// We can unroll by the upper bound amount if it's generally allowed or if
// we know that the loop is executed either the upper bound or zero times.
// (MaxOrZero unrolling keeps only the first loop test, so the number of
@@ -1030,7 +1032,7 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
// Unroll the loop.
if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero,
- TripMultiple, UP.PeelCount, LI, SE, &DT, &AC, &ORE,
+ TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE,
PreserveLCSSA))
return false;
@@ -1073,7 +1075,7 @@ public:
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -1157,7 +1159,7 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
if (!AllowPartialUnrolling)
AllowPartialParam = RuntimeParam = UpperBoundParam = false;
bool Changed = tryToUnrollLoop(
- &L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE,
+ &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
/*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
/*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam);
if (!Changed)
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 7a7624f77542..9cf01c6582b5 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -2423,8 +2423,7 @@ void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
AllTempInstructions.insert(Op);
PHIOfOpsPHIs[BB].push_back(Op);
TempToBlock[Op] = BB;
- if (ExistingValue)
- RealToTemp[ExistingValue] = Op;
+ RealToTemp[ExistingValue] = Op;
}
static bool okayForPHIOfOps(const Instruction *I) {
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 6da551bd7efd..cdba0062953f 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -1894,6 +1894,8 @@ void ReassociatePass::EraseInst(Instruction *I) {
Op = Op->user_back();
RedoInsts.insert(Op);
}
+
+ MadeChange = true;
}
// Canonicalize expressions of the following form:
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index a52739bb76f7..a73e9aec0617 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1954,7 +1954,7 @@ static void rematerializeLiveValues(CallSite CS,
// to identify the newly generated AlternateRootPhi (.base version of phi)
// and RootOfChain (the original phi node itself) are the same, so that we
// can rematerialize the gep and casts. This is a workaround for the
- // deficieny in the findBasePointer algorithm.
+ // deficiency in the findBasePointer algorithm.
if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi))
continue;
// Now that the phi nodes are proved to be the same, assert that
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 80fbbeb6829b..4729f4ef5956 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -2402,9 +2402,20 @@ private:
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+ // Any !nonnull metadata or !range metadata on the old load is also valid
+      // on the new load. This is true in some cases even when the loads
+ // are different types, for example by mapping !nonnull metadata to
+ // !range metadata by modeling the null pointer constant converted to the
+ // integer type.
+ // FIXME: Add support for range metadata here. Currently the utilities
+ // for this don't propagate range metadata in trivial cases from one
+ // integer load to another, don't handle non-addrspace-0 null pointers
+ // correctly, and don't have any support for mapping ranges as the
+      // integer type becomes wider or narrower.
+ if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
+ copyNonnullMetadata(LI, N, *NewLI);
+
// Try to preserve nonnull metadata
- if (TargetTy->isPointerTy())
- NewLI->copyMetadata(LI, LLVMContext::MD_nonnull);
V = NewLI;
// If this is an integer load past the end of the slice (which means the
@@ -3580,10 +3591,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
int Idx = 0, Size = Offsets.Splits.size();
for (;;) {
auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
- auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ auto AS = LI->getPointerAddressSpace();
+ auto *PartPtrTy = PartTy->getPointerTo(AS);
LoadInst *PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, BasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
PartPtrTy, BasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
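The APInt change encodes the fact that pointer width is a per-address-space property of the DataLayout. A minimal sketch with illustrative names:

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static APInt partOffsetFor(const DataLayout &DL, LoadInst *LI,
                               uint64_t PartOffset) {
      unsigned AS = LI->getPointerAddressSpace();
      // Size the offset integer to this address space's pointer width;
      // space 0 may be 64-bit while another space uses 32-bit pointers.
      return APInt(DL.getPointerSizeInBits(AS), PartOffset);
    }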
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 5d57ed9718fb..30d8856cfbef 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -59,6 +59,33 @@ bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB) {
// Landing pads must be in the function where they were inserted for cleanup.
if (BB.isEHPad())
return false;
+  // Taking the address of a basic block moved to another function is illegal.
+ if (BB.hasAddressTaken())
+ return false;
+
+  // Don't hoist code that uses another basic block's address, as it's likely
+  // to lead to unexpected behavior, like cross-function jumps.
+ SmallPtrSet<User const *, 16> Visited;
+ SmallVector<User const *, 16> ToVisit;
+
+ for (Instruction const &Inst : BB)
+ ToVisit.push_back(&Inst);
+
+ while (!ToVisit.empty()) {
+ User const *Curr = ToVisit.pop_back_val();
+ if (!Visited.insert(Curr).second)
+ continue;
+ if (isa<BlockAddress const>(Curr))
+      return false; // even a reference to self is likely to be incompatible
+
+ if (isa<Instruction>(Curr) && cast<Instruction>(Curr)->getParent() != &BB)
+ continue;
+
+ for (auto const &U : Curr->operands()) {
+ if (auto *UU = dyn_cast<User>(U))
+ ToVisit.push_back(UU);
+ }
+ }
// Don't hoist code containing allocas, invokes, or vastarts.
for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
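Blockaddress constants come from the labels-as-values extension; once a block's address escapes, outlining that block (or code that mentions it) would leave an indirect branch pointing across function boundaries. A small illustration, using the GNU C++ extension, of code the extractor now refuses:

    // Each &&label lowers to a blockaddress constant; the goto* becomes an
    // indirectbr, which must stay in the same function as its targets.
    int dispatch(int i) {
      static void *Targets[] = {&&even, &&odd};
      goto *Targets[i & 1];
    even:
      return 0;
    odd:
      return 1;
    }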
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 5f85e17927fa..9ad2b707e6b2 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -36,6 +36,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <algorithm>
@@ -45,6 +46,10 @@ using namespace llvm;
STATISTIC(NumRuntimeUnrolled,
"Number of loops unrolled with run-time trip counts");
+static cl::opt<bool> UnrollRuntimeMultiExit(
+ "unroll-runtime-multi-exit", cl::init(false), cl::Hidden,
+ cl::desc("Allow runtime unrolling for loops with multiple exits, when "
+ "epilog is generated"));
/// Connect the unrolling prolog code to the original loop.
/// The unrolling prolog code contains code to execute the
@@ -285,15 +290,13 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
/// The cloned blocks should be inserted between InsertTop and InsertBot.
/// If loop structure is cloned InsertTop should be new preheader, InsertBot
/// new loop exit.
-///
-static void CloneLoopBlocks(Loop *L, Value *NewIter,
- const bool CreateRemainderLoop,
- const bool UseEpilogRemainder,
- BasicBlock *InsertTop, BasicBlock *InsertBot,
- BasicBlock *Preheader,
- std::vector<BasicBlock *> &NewBlocks,
- LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
- DominatorTree *DT, LoopInfo *LI) {
+/// Return the new cloned loop that is created when CreateRemainderLoop is true.
+static Loop *
+CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
+ const bool UseEpilogRemainder, BasicBlock *InsertTop,
+ BasicBlock *InsertBot, BasicBlock *Preheader,
+ std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
+ ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {
StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
@@ -418,7 +421,10 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
// Set operand 0 to refer to the loop id itself.
NewLoopID->replaceOperandWith(0, NewLoopID);
NewLoop->setLoopID(NewLoopID);
+ return NewLoop;
}
+ else
+ return nullptr;
}
/// Insert code in the prolog/epilog code when unrolling a loop with a
@@ -465,29 +471,52 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, bool PreserveLCSSA) {
// for now, only unroll loops that contain a single exit
- if (!L->getExitingBlock())
+ if (!UnrollRuntimeMultiExit && !L->getExitingBlock())
return false;
- // Make sure the loop is in canonical form, and there is a single
- // exit block only.
+ // Make sure the loop is in canonical form.
if (!L->isLoopSimplifyForm())
return false;
// Guaranteed by LoopSimplifyForm.
BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Header = L->getHeader();
BasicBlock *LatchExit = L->getUniqueExitBlock(); // successor out of loop
- if (!LatchExit)
+ if (!LatchExit && !UnrollRuntimeMultiExit)
return false;
+ // These are exit blocks other than the target of the latch exiting block.
+ SmallVector<BasicBlock *, 4> OtherExits;
+ BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+ unsigned int ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
// Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
- // targets of the Latch be the single exit block out of the loop. This needs
+ // targets of the Latch be an exit block out of the loop. This needs
// to be guaranteed by the callers of UnrollRuntimeLoopRemainder.
- BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
- assert((LatchBR->getSuccessor(0) == LatchExit ||
- LatchBR->getSuccessor(1) == LatchExit) &&
- "one of the loop latch successors should be "
- "the exit block!");
- (void)LatchBR;
+ assert(!L->contains(LatchBR->getSuccessor(ExitIndex)) &&
+ "one of the loop latch successors should be the exit block!");
+ // Support runtime unrolling for multiple exit blocks and multiple exiting
+ // blocks.
+ if (!LatchExit) {
+ assert(UseEpilogRemainder && "Multi exit unrolling is currently supported "
+                                 "with epilog remainder only!");
+ LatchExit = LatchBR->getSuccessor(ExitIndex);
+ // We rely on LCSSA form being preserved when the exit blocks are
+ // transformed.
+ if (!PreserveLCSSA)
+ return false;
+ // TODO: Support multiple exiting blocks jumping to the `LatchExit`. This
+ // will need updating the logic in connectEpilog.
+ if (!LatchExit->getSinglePredecessor())
+ return false;
+ SmallVector<BasicBlock *, 4> Exits;
+ L->getUniqueExitBlocks(Exits);
+ for (auto *BB : Exits)
+ if (BB != LatchExit)
+ OtherExits.push_back(BB);
+ }
+
+ assert(LatchExit && "Latch Exit should exist!");
+
// Use Scalar Evolution to compute the trip count. This allows more loops to
// be unrolled than relying on induction var simplification.
if (!SE)
@@ -495,7 +524,11 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// Only unroll loops with a computable trip count, and the trip count needs
// to be an int value (allowing a pointer type is a TODO item).
- const SCEV *BECountSC = SE->getBackedgeTakenCount(L);
+ // We calculate the backedge count by using getExitCount on the Latch block,
+  // which is proven to be the only exiting block in this loop. This is the same
+  // as calculating getBackedgeTakenCount on the loop (which computes SCEV for all
+ // exiting blocks).
+ const SCEV *BECountSC = SE->getExitCount(L, Latch);
if (isa<SCEVCouldNotCompute>(BECountSC) ||
!BECountSC->getType()->isIntegerTy())
return false;
@@ -508,7 +541,6 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
if (isa<SCEVCouldNotCompute>(TripCountSC))
return false;
- BasicBlock *Header = L->getHeader();
BasicBlock *PreHeader = L->getLoopPreheader();
BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
const DataLayout &DL = Header->getModule()->getDataLayout();
@@ -650,8 +682,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// iterations. This function adds the appropriate CFG connections.
BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
- CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop,
- InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
+ Loop *remainderLoop = CloneLoopBlocks(
+ L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot,
+ NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
// Insert the cloned blocks into the function.
F->getBasicBlockList().splice(InsertBot->getIterator(),
@@ -659,6 +692,42 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
NewBlocks[0]->getIterator(),
F->end());
+ // Now the loop blocks are cloned and the other exiting blocks from the
+ // remainder are connected to the original Loop's exit blocks. The remaining
+ // work is to update the phi nodes in the original loop, and take in the
+ // values from the cloned region. Also update the dominator info for
+ // OtherExits, since we have new edges into OtherExits.
+ for (auto *BB : OtherExits) {
+ for (auto &II : *BB) {
+
+ // Given we preserve LCSSA form, we know that the values used outside the
+ // loop will be used through these phi nodes at the exit blocks that are
+ // transformed below.
+ if (!isa<PHINode>(II))
+ break;
+ PHINode *Phi = cast<PHINode>(&II);
+ unsigned oldNumOperands = Phi->getNumIncomingValues();
+ // Add the incoming values from the remainder code to the end of the phi
+ // node.
+      for (unsigned i = 0; i < oldNumOperands; i++) {
+ Value *newVal = VMap[Phi->getIncomingValue(i)];
+ if (!newVal) {
+ assert(isa<Constant>(Phi->getIncomingValue(i)) &&
+ "VMap should exist for all values except constants!");
+ newVal = Phi->getIncomingValue(i);
+ }
+ Phi->addIncoming(newVal,
+ cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
+ }
+ }
+ // Update the dominator info because the immediate dominator is no longer the
+ // header of the original Loop. BB has edges both from L and remainder code.
+ // Since the preheader determines which loop is run (L or directly jump to
+ // the remainder code), we set the immediate dominator as the preheader.
+ if (DT)
+ DT->changeImmediateDominator(BB, PreHeader);
+ }
+
// Loop structure should be the following:
// Epilog Prolog
//
@@ -721,6 +790,19 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
if (Loop *ParentLoop = L->getParentLoop())
SE->forgetLoop(ParentLoop);
+ // Canonicalize to LoopSimplifyForm both original and remainder loops. We
+ // cannot rely on the LoopUnrollPass to do this because it only does
+ // canonicalization for parent/subloops and not the sibling loops.
+ if (OtherExits.size() > 0) {
+ // Generate dedicated exit blocks for the original loop, to preserve
+ // LoopSimplifyForm.
+ formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA);
+ // Generate dedicated exit blocks for the remainder loop if one exists, to
+ // preserve LoopSimplifyForm.
+ if (remainderLoop)
+ formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA);
+ }
+
NumRuntimeUnrolled++;
return true;
}
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index 0ed33945ef40..58b70be95d99 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -528,8 +528,9 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
return false;
}
-bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
- DominatorTree *DT) {
+bool RecurrenceDescriptor::isFirstOrderRecurrence(
+ PHINode *Phi, Loop *TheLoop,
+ DenseMap<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) {
// Ensure the phi node is in the loop header and has two incoming values.
if (Phi->getParent() != TheLoop->getHeader() ||
@@ -551,12 +552,24 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
// Get the previous value. The previous value comes from the latch edge while
// the initial value comes form the preheader edge.
auto *Previous = dyn_cast<Instruction>(Phi->getIncomingValueForBlock(Latch));
- if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous))
+ if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous) ||
+ SinkAfter.count(Previous)) // Cannot rely on dominance due to motion.
return false;
// Ensure every user of the phi node is dominated by the previous value.
// The dominance requirement ensures the loop vectorizer will not need to
// vectorize the initial value prior to the first iteration of the loop.
+ // TODO: Consider extending this sinking to handle other kinds of instructions
+ // and expressions, beyond sinking a single cast past Previous.
+ if (Phi->hasOneUse()) {
+ auto *I = Phi->user_back();
+ if (I->isCast() && (I->getParent() == Phi->getParent()) && I->hasOneUse() &&
+ DT->dominates(Previous, I->user_back())) {
+ SinkAfter[I] = Previous;
+ return true;
+ }
+ }
+
for (User *U : Phi->users())
if (auto *I = dyn_cast<Instruction>(U)) {
if (!DT->dominates(Previous, I))
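An illustrative recurrence (sketched IR, made-up names) that the new SinkAfter map unlocks: the phi's only user is a cast sitting above Previous, but the cast's own user is dominated by Previous, so the cast can legally be sunk.

    loop:
      %phi  = phi i16 [ %init, %ph ], [ %prev, %loop ]
      %ext  = sext i16 %phi to i32    ; sole user of %phi, above %prev
      %prev = load i16, i16* %ptr     ; the recurrence's previous value
      %use  = add i32 %ext, 1         ; %ext's one user, below %prev

Recording SinkAfter[%ext] = %prev lets the vectorizer later move the sext below %prev, after which Previous dominates every remaining user of the phi, as the dominance check requires.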
diff --git a/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 0a51f9a0e4a2..1c2a60a6b8b2 100644
--- a/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -27,7 +27,6 @@ void llvm::createMemCpyLoop(Instruction *InsertBefore,
BasicBlock *LoopBB = BasicBlock::Create(F->getContext(), "loadstoreloop",
F, NewBB);
- OrigBB->getTerminator()->setSuccessor(0, LoopBB);
IRBuilder<> Builder(OrigBB->getTerminator());
// SrcAddr and DstAddr are expected to be pointer types,
@@ -39,6 +38,11 @@ void llvm::createMemCpyLoop(Instruction *InsertBefore,
SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
+ Builder.CreateCondBr(
+ Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
+ LoopBB);
+ OrigBB->getTerminator()->eraseFromParent();
+
IRBuilder<> LoopBuilder(LoopBB);
PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
@@ -167,6 +171,7 @@ static void createMemMoveLoop(Instruction *InsertBefore,
static void createMemSetLoop(Instruction *InsertBefore,
Value *DstAddr, Value *CopyLen, Value *SetValue,
unsigned Align, bool IsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent();
BasicBlock *NewBB =
@@ -174,7 +179,6 @@ static void createMemSetLoop(Instruction *InsertBefore,
BasicBlock *LoopBB
= BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
- OrigBB->getTerminator()->setSuccessor(0, LoopBB);
IRBuilder<> Builder(OrigBB->getTerminator());
// Cast pointer to the type of value getting stored
@@ -182,9 +186,14 @@ static void createMemSetLoop(Instruction *InsertBefore,
DstAddr = Builder.CreateBitCast(DstAddr,
PointerType::get(SetValue->getType(), dstAS));
+ Builder.CreateCondBr(
+ Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
+ LoopBB);
+ OrigBB->getTerminator()->eraseFromParent();
+
IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0);
- LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
LoopBuilder.CreateStore(
SetValue,
@@ -192,7 +201,7 @@ static void createMemSetLoop(Instruction *InsertBefore,
IsVolatile);
Value *NewIndex =
- LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1));
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
LoopIndex->addIncoming(NewIndex, LoopBB);
LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
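
A rough C++ rendering (illustrative only, not the emitted IR) of the control flow the patched memset lowering produces; the memcpy path gains the same guard. Previously the "loadstoreloop" body always executed at least once, which was wrong for a zero CopyLen.

void loweredMemset(unsigned char *Dst, unsigned char Val, unsigned long Len) {
  if (Len != 0) {            // new guard: CreateICmpEQ(0, CopyLen) branches to NewBB
    unsigned long I = 0;     // LoopIndex phi, incoming 0 from OrigBB
    do {
      Dst[I] = Val;          // LoopBuilder.CreateStore(SetValue, ...)
      ++I;                   // NewIndex = LoopIndex + 1
    } while (I < Len);       // CreateICmpULT(NewIndex, CopyLen)
  }
}
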
diff --git a/lib/Transforms/Utils/OrderedInstructions.cpp b/lib/Transforms/Utils/OrderedInstructions.cpp
index 2e67e0def5b9..dc780542ce68 100644
--- a/lib/Transforms/Utils/OrderedInstructions.cpp
+++ b/lib/Transforms/Utils/OrderedInstructions.cpp
@@ -27,7 +27,6 @@ bool OrderedInstructions::dominates(const Instruction *InstA,
if (OBB == OBBMap.end())
OBB = OBBMap.insert({IBB, make_unique<OrderedBasicBlock>(IBB)}).first;
return OBB->second->dominates(InstA, InstB);
- } else {
- return DT->dominates(InstA->getParent(), InstB->getParent());
}
+ return DT->dominates(InstA->getParent(), InstB->getParent());
}
diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp
index 1260e35e934d..d4cdaede6b86 100644
--- a/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/lib/Transforms/Utils/PredicateInfo.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
@@ -34,6 +33,7 @@
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/OrderedInstructions.h"
#include <algorithm>
#define DEBUG_TYPE "predicateinfo"
using namespace llvm;
@@ -106,14 +106,27 @@ struct ValueDFS {
bool EdgeOnly = false;
};
+// Impose a strict weak ordering on instructions and arguments.
+static bool valueComesBefore(OrderedInstructions &OI, const Value *A,
+ const Value *B) {
+ auto *ArgA = dyn_cast_or_null<Argument>(A);
+ auto *ArgB = dyn_cast_or_null<Argument>(B);
+ if (ArgA && !ArgB)
+ return true;
+ if (ArgB && !ArgA)
+ return false;
+ if (ArgA && ArgB)
+ return ArgA->getArgNo() < ArgB->getArgNo();
+ return OI.dominates(cast<Instruction>(A), cast<Instruction>(B));
+}
+
// This compares ValueDFS structures, creating OrderedBasicBlocks where
// necessary to compare uses/defs in the same block. Doing so allows us to walk
// the minimum number of instructions necessary to compute our def/use ordering.
struct ValueDFS_Compare {
- DenseMap<const BasicBlock *, std::unique_ptr<OrderedBasicBlock>> &OBBMap;
- ValueDFS_Compare(
- DenseMap<const BasicBlock *, std::unique_ptr<OrderedBasicBlock>> &OBBMap)
- : OBBMap(OBBMap) {}
+ OrderedInstructions &OI;
+ ValueDFS_Compare(OrderedInstructions &OI) : OI(OI) {}
+
bool operator()(const ValueDFS &A, const ValueDFS &B) const {
if (&A == &B)
return false;
@@ -196,23 +209,12 @@ struct ValueDFS_Compare {
auto *ArgA = dyn_cast_or_null<Argument>(ADef);
auto *ArgB = dyn_cast_or_null<Argument>(BDef);
- if (ArgA && !ArgB)
- return true;
- if (ArgB && !ArgA)
- return false;
- if (ArgA && ArgB)
- return ArgA->getArgNo() < ArgB->getArgNo();
+ if (ArgA || ArgB)
+ return valueComesBefore(OI, ArgA, ArgB);
auto *AInst = getDefOrUser(ADef, A.U);
auto *BInst = getDefOrUser(BDef, B.U);
-
- auto *BB = AInst->getParent();
- auto LookupResult = OBBMap.find(BB);
- if (LookupResult != OBBMap.end())
- return LookupResult->second->dominates(AInst, BInst);
-
- auto Result = OBBMap.insert({BB, make_unique<OrderedBasicBlock>(BB)});
- return Result.first->second->dominates(AInst, BInst);
+ return valueComesBefore(OI, AInst, BInst);
}
};
@@ -547,38 +549,11 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,
void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {
// Sort OpsToRename since we are going to iterate it.
SmallVector<Value *, 8> OpsToRename(OpSet.begin(), OpSet.end());
- std::sort(OpsToRename.begin(), OpsToRename.end(), [&](const Value *A,
- const Value *B) {
- auto *ArgA = dyn_cast_or_null<Argument>(A);
- auto *ArgB = dyn_cast_or_null<Argument>(B);
-
- // If A and B are args, order them based on their arg no.
- if (ArgA && !ArgB)
- return true;
- if (ArgB && !ArgA)
- return false;
- if (ArgA && ArgB)
- return ArgA->getArgNo() < ArgB->getArgNo();
-
-    // Else, A and B are instructions.
- // If they belong to different BBs, order them by the dominance of BBs.
- auto *AInst = cast<Instruction>(A);
- auto *BInst = cast<Instruction>(B);
- if (AInst->getParent() != BInst->getParent())
- return DT.dominates(AInst->getParent(), BInst->getParent());
-
- // Else, A and B belong to the same BB.
- // Order A and B by their dominance.
- auto *BB = AInst->getParent();
- auto LookupResult = OBBMap.find(BB);
- if (LookupResult != OBBMap.end())
- return LookupResult->second->dominates(AInst, BInst);
-
- auto Result = OBBMap.insert({BB, make_unique<OrderedBasicBlock>(BB)});
- return Result.first->second->dominates(AInst, BInst);
- });
-
- ValueDFS_Compare Compare(OBBMap);
+ auto Comparator = [&](const Value *A, const Value *B) {
+ return valueComesBefore(OI, A, B);
+ };
+ std::sort(OpsToRename.begin(), OpsToRename.end(), Comparator);
+ ValueDFS_Compare Compare(OI);
// Compute liveness, and rename in O(uses) per Op.
for (auto *Op : OpsToRename) {
unsigned Counter = 0;
@@ -715,7 +690,7 @@ PredicateInfo::getValueInfo(Value *Operand) const {
PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
AssumptionCache &AC)
- : F(F), DT(DT), AC(AC) {
+ : F(F), DT(DT), AC(AC), OI(&DT) {
// Push an empty operand info so that we can detect 0 as not finding one
ValueInfos.resize(1);
buildPredicateInfo();
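
A self-contained analogue (hypothetical types, not PredicateInfo's API) of the rule valueComesBefore encodes: arguments sort before instructions, arguments compare by index, and instructions fall back to a dominance-based rank. Spelling out the argument cases keeps the comparator a strict weak ordering, which std::sort requires.

#include <algorithm>
#include <vector>

struct Val { bool IsArg; unsigned ArgNo; unsigned Rank; };

static bool comesBefore(const Val &A, const Val &B) {
  if (A.IsArg && !B.IsArg) return true;   // arguments precede instructions
  if (B.IsArg && !A.IsArg) return false;
  if (A.IsArg && B.IsArg) return A.ArgNo < B.ArgNo;
  return A.Rank < B.Rank;                 // stand-in for OI.dominates(A, B)
}

int main() {
  std::vector<Val> Ops = {{false, 0, 2}, {true, 1, 0}, {false, 0, 1}, {true, 0, 0}};
  std::sort(Ops.begin(), Ops.end(), comesBefore);
  // Sorted order: arg #0, arg #1, instruction rank 1, instruction rank 2.
}
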
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 0970c436e665..e724b0a28c32 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -4781,7 +4781,7 @@ public:
SwitchLookupTable(
Module &M, uint64_t TableSize, ConstantInt *Offset,
const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
- Constant *DefaultValue, const DataLayout &DL);
+ Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName);
/// Build instructions with Builder to retrieve the value at
/// the position given by Index in the lookup table.
@@ -4835,7 +4835,7 @@ private:
SwitchLookupTable::SwitchLookupTable(
Module &M, uint64_t TableSize, ConstantInt *Offset,
const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
- Constant *DefaultValue, const DataLayout &DL)
+ Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName)
: SingleValue(nullptr), BitMap(nullptr), BitMapElementTy(nullptr),
LinearOffset(nullptr), LinearMultiplier(nullptr), Array(nullptr) {
assert(Values.size() && "Can't build lookup table without values!");
@@ -4943,7 +4943,7 @@ SwitchLookupTable::SwitchLookupTable(
Array = new GlobalVariable(M, ArrayTy, /*constant=*/true,
GlobalVariable::PrivateLinkage, Initializer,
- "switch.table");
+ "switch.table." + FuncName);
Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
Kind = ArrayKind;
}
@@ -5333,7 +5333,9 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// If using a bitmask, use any value to fill the lookup table holes.
Constant *DV = NeedMask ? ResultLists[PHI][0].second : DefaultResults[PHI];
- SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL);
+ StringRef FuncName = SI->getParent()->getParent()->getName();
+ SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL,
+ FuncName);
Value *Result = Table.BuildLookup(TableIndex, Builder);
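
To illustrate the effect (function and values made up): lowering the switch below now yields a table global named roughly @switch.table.dayLength, after the enclosing IR function, rather than the bare @switch.table, so a table can be attributed to its function in a large module.

int dayLength(int month) {    // assume the IR function is named "dayLength"
  switch (month) {
  case 0: return 31;
  case 1: return 28;
  case 2: return 31;
  case 3: return 30;
  default: return 0;
  }
}
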
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index faa14046b1e3..ec8b0d426265 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -354,7 +354,7 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) {
typedef const SCEV *(ScalarEvolution::*OperationFunctionTy)(
const SCEV *, const SCEV *, SCEV::NoWrapFlags, unsigned);
typedef const SCEV *(ScalarEvolution::*ExtensionFunctionTy)(
- const SCEV *, Type *);
+ const SCEV *, Type *, unsigned);
OperationFunctionTy Operation;
ExtensionFunctionTy Extension;
@@ -406,11 +406,11 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) {
IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2);
const SCEV *A =
- (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0u),
- WideTy);
+ (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0),
+ WideTy, 0);
const SCEV *B =
- (SE->*Operation)((SE->*Extension)(LHS, WideTy),
- (SE->*Extension)(RHS, WideTy), SCEV::FlagAnyWrap, 0u);
+ (SE->*Operation)((SE->*Extension)(LHS, WideTy, 0),
+ (SE->*Extension)(RHS, WideTy, 0), SCEV::FlagAnyWrap, 0);
if (A != B)
return false;
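
The identity being tested, written out at runtime for one concrete case (a sketch of the idea; the pass does the same comparison symbolically via SCEV): an i32 signed add cannot overflow iff widening to i64 commutes with the addition, i.e. sext(a + b) == sext(a) + sext(b). The hunk itself only threads a new trailing argument through the extension member functions.

#include <cstdint>

// True iff a + b overflows i32: compare the narrow result, widened,
// against the same operation carried out entirely in the wide type.
bool saddOverflows(int32_t a, int32_t b) {
  int32_t Narrow = (int32_t)((uint32_t)a + (uint32_t)b); // wrapping add
  return (int64_t)Narrow != (int64_t)a + (int64_t)b;     // equal means no overflow
}
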
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
deleted file mode 100644
index 78453aaa16ce..000000000000
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ /dev/null
@@ -1,3282 +0,0 @@
-//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a basic-block vectorization pass. The algorithm was
-// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
-// et al. It works by looking for chains of pairable operations and then
-// pairing them.
-//
-//===----------------------------------------------------------------------===//
-
-#define BBV_NAME "bb-vectorize"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Vectorize.h"
-#include <algorithm>
-using namespace llvm;
-
-#define DEBUG_TYPE BBV_NAME
-
-static cl::opt<bool>
-IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false),
- cl::Hidden, cl::desc("Ignore target information"));
-
-static cl::opt<unsigned>
-ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
- cl::desc("The required chain depth for vectorization"));
-
-static cl::opt<bool>
-UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false),
- cl::Hidden, cl::desc("Use the chain depth requirement with"
- " target information"));
-
-static cl::opt<unsigned>
-SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
- cl::desc("The maximum search distance for instruction pairs"));
-
-static cl::opt<bool>
-SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
- cl::desc("Replicating one element to a pair breaks the chain"));
-
-static cl::opt<unsigned>
-VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
- cl::desc("The size of the native vector registers"));
-
-static cl::opt<unsigned>
-MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
- cl::desc("The maximum number of pairing iterations"));
-
-static cl::opt<bool>
-Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
- cl::desc("Don't try to form non-2^n-length vectors"));
-
-static cl::opt<unsigned>
-MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
- cl::desc("The maximum number of pairable instructions per group"));
-
-static cl::opt<unsigned>
-MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
- cl::desc("The maximum number of candidate instruction pairs per group"));
-
-static cl::opt<unsigned>
-MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
- cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
- " a full cycle check"));
-
-static cl::opt<bool>
-NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize boolean (i1) values"));
-
-static cl::opt<bool>
-NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize integer values"));
-
-static cl::opt<bool>
-NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize floating-point values"));
-
-// FIXME: This should default to false once pointer vector support works.
-static cl::opt<bool>
-NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
- cl::desc("Don't try to vectorize pointer values"));
-
-static cl::opt<bool>
-NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize casting (conversion) operations"));
-
-static cl::opt<bool>
-NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize floating-point math intrinsics"));
-
-static cl::opt<bool>
- NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize BitManipulation intrinsics"));
-
-static cl::opt<bool>
-NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
-
-static cl::opt<bool>
-NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize select instructions"));
-
-static cl::opt<bool>
-NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize comparison instructions"));
-
-static cl::opt<bool>
-NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize getelementptr instructions"));
-
-static cl::opt<bool>
-NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize loads and stores"));
-
-static cl::opt<bool>
-AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
- cl::desc("Only generate aligned loads and stores"));
-
-static cl::opt<bool>
-NoMemOpBoost("bb-vectorize-no-mem-op-boost",
- cl::init(false), cl::Hidden,
- cl::desc("Don't boost the chain-depth contribution of loads and stores"));
-
-static cl::opt<bool>
-FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
- cl::desc("Use a fast instruction dependency analysis"));
-
-#ifndef NDEBUG
-static cl::opt<bool>
-DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
- cl::init(false), cl::Hidden,
- cl::desc("When debugging is enabled, output information on the"
- " instruction-examination process"));
-static cl::opt<bool>
-DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
- cl::init(false), cl::Hidden,
- cl::desc("When debugging is enabled, output information on the"
- " candidate-selection process"));
-static cl::opt<bool>
-DebugPairSelection("bb-vectorize-debug-pair-selection",
- cl::init(false), cl::Hidden,
- cl::desc("When debugging is enabled, output information on the"
- " pair-selection process"));
-static cl::opt<bool>
-DebugCycleCheck("bb-vectorize-debug-cycle-check",
- cl::init(false), cl::Hidden,
- cl::desc("When debugging is enabled, output information on the"
- " cycle-checking process"));
-
-static cl::opt<bool>
-PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
- cl::init(false), cl::Hidden,
- cl::desc("When debugging is enabled, dump the basic block after"
- " every pair is fused"));
-#endif
-
-STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
-
-namespace {
- struct BBVectorize : public BasicBlockPass {
- static char ID; // Pass identification, replacement for typeid
-
- const VectorizeConfig Config;
-
- BBVectorize(const VectorizeConfig &C = VectorizeConfig())
- : BasicBlockPass(ID), Config(C) {
- initializeBBVectorizePass(*PassRegistry::getPassRegistry());
- }
-
- BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
- : BasicBlockPass(ID), Config(C) {
- AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults();
- DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- TTI = IgnoreTargetInfo
- ? nullptr
- : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- }
-
- typedef std::pair<Value *, Value *> ValuePair;
- typedef std::pair<ValuePair, int> ValuePairWithCost;
- typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
- typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
- typedef std::pair<VPPair, unsigned> VPPairWithType;
-
- AliasAnalysis *AA;
- DominatorTree *DT;
- ScalarEvolution *SE;
- const TargetLibraryInfo *TLI;
- const TargetTransformInfo *TTI;
-
- // FIXME: const correct?
-
- bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
-
- bool getCandidatePairs(BasicBlock &BB,
- BasicBlock::iterator &Start,
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts, bool NonPow2Len);
-
- // FIXME: The current implementation does not account for pairs that
- // are connected in multiple ways. For example:
- // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
- enum PairConnectionType {
- PairConnectionDirect,
- PairConnectionSwap,
- PairConnectionSplat
- };
-
- void computeConnectedPairs(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes);
-
- void buildDepMap(BasicBlock &BB,
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &PairableInstUsers);
-
- void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<Value *, Value *>& ChosenPairs);
-
- void fuseChosenPairs(BasicBlock &BB,
- std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *>& ChosenPairs,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps);
-
-
- bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
-
- bool areInstsCompatible(Instruction *I, Instruction *J,
- bool IsSimpleLoadStore, bool NonPow2Len,
- int &CostSavings, int &FixedOrder);
-
- bool trackUsesOfI(DenseSet<Value *> &Users,
- AliasSetTracker &WriteSet, Instruction *I,
- Instruction *J, bool UpdateUsers = true,
- DenseSet<ValuePair> *LoadMoveSetPairs = nullptr);
-
- void computePairsConnectedTo(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- ValuePair P);
-
- bool pairsConflict(ValuePair P, ValuePair Q,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<ValuePair, std::vector<ValuePair> >
- *PairableInstUserMap = nullptr,
- DenseSet<VPPair> *PairableInstUserPairSet = nullptr);
-
- bool pairWillFormCycle(ValuePair P,
- DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
- DenseSet<ValuePair> &CurrentPairs);
-
- void pruneDAGFor(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
- DenseSet<VPPair> &PairableInstUserPairSet,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<ValuePair, size_t> &DAG,
- DenseSet<ValuePair> &PrunedDAG, ValuePair J,
- bool UseCycleCheck);
-
- void buildInitialDAGFor(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<ValuePair, size_t> &DAG, ValuePair J);
-
- void findBestDAGFor(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
- DenseSet<VPPair> &PairableInstUserPairSet,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
- int &BestEffSize, Value *II, std::vector<Value *>&JJ,
- bool UseCycleCheck);
-
- Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o);
-
- void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
- unsigned MaskOffset, unsigned NumInElem,
- unsigned NumInElem1, unsigned IdxOffset,
- std::vector<Constant*> &Mask);
-
- Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
- Instruction *J);
-
- bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
- unsigned o, Value *&LOp, unsigned numElemL,
- Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
- unsigned IdxOff = 0);
-
- Value *getReplacementInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, bool IBeforeJ);
-
- void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
- Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands,
- bool IBeforeJ);
-
- void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
- Instruction *J, Instruction *K,
- Instruction *&InsertionPt, Instruction *&K1,
- Instruction *&K2);
-
- void collectPairLoadMoveSet(BasicBlock &BB,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
- DenseSet<ValuePair> &LoadMoveSetPairs,
- Instruction *I);
-
- void collectLoadMoveSet(BasicBlock &BB,
- std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
- DenseSet<ValuePair> &LoadMoveSetPairs);
-
- bool canMoveUsesOfIAfterJ(BasicBlock &BB,
- DenseSet<ValuePair> &LoadMoveSetPairs,
- Instruction *I, Instruction *J);
-
- void moveUsesOfIAfterJ(BasicBlock &BB,
- DenseSet<ValuePair> &LoadMoveSetPairs,
- Instruction *&InsertionPt,
- Instruction *I, Instruction *J);
-
- bool vectorizeBB(BasicBlock &BB) {
- if (skipBasicBlock(BB))
- return false;
- if (!DT->isReachableFromEntry(&BB)) {
- DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
- " in " << BB.getParent()->getName() << "\n");
- return false;
- }
-
- DEBUG(if (TTI) dbgs() << "BBV: using target information\n");
-
- bool changed = false;
- // Iterate a sufficient number of times to merge types of size 1 bit,
-      // then 2 bits, then 4, etc. up to half of the width of the target
-      // vector register.
- unsigned n = 1;
- for (unsigned v = 2;
- (TTI || v <= Config.VectorBits) &&
- (!Config.MaxIter || n <= Config.MaxIter);
- v *= 2, ++n) {
- DEBUG(dbgs() << "BBV: fusing loop #" << n <<
- " for " << BB.getName() << " in " <<
- BB.getParent()->getName() << "...\n");
- if (vectorizePairs(BB))
- changed = true;
- else
- break;
- }
-
- if (changed && !Pow2LenOnly) {
- ++n;
- for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
- DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
- n << " for " << BB.getName() << " in " <<
- BB.getParent()->getName() << "...\n");
- if (!vectorizePairs(BB, true)) break;
- }
- }
-
- DEBUG(dbgs() << "BBV: done!\n");
- return changed;
- }
-
- bool runOnBasicBlock(BasicBlock &BB) override {
- // OptimizeNone check deferred to vectorizeBB().
-
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- TTI = IgnoreTargetInfo
- ? nullptr
- : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *BB.getParent());
-
- return vectorizeBB(BB);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- BasicBlockPass::getAnalysisUsage(AU);
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
- AU.setPreservesCFG();
- }
-
- static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
- assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
- "Cannot form vector from incompatible scalar types");
- Type *STy = ElemTy->getScalarType();
-
- unsigned numElem;
- if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
- numElem = VTy->getNumElements();
- } else {
- numElem = 1;
- }
-
- if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
- numElem += VTy->getNumElements();
- } else {
- numElem += 1;
- }
-
- return VectorType::get(STy, numElem);
- }
-
- static inline void getInstructionTypes(Instruction *I,
- Type *&T1, Type *&T2) {
- if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- // For stores, it is the value type, not the pointer type that matters
- // because the value is what will come from a vector register.
-
- Value *IVal = SI->getValueOperand();
- T1 = IVal->getType();
- } else {
- T1 = I->getType();
- }
-
- if (CastInst *CI = dyn_cast<CastInst>(I))
- T2 = CI->getSrcTy();
- else
- T2 = T1;
-
- if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
- T2 = SI->getCondition()->getType();
- } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
- T2 = SI->getOperand(0)->getType();
- } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
- T2 = CI->getOperand(0)->getType();
- }
- }
-
- // Returns the weight associated with the provided value. A chain of
- // candidate pairs has a length given by the sum of the weights of its
- // members (one weight per pair; the weight of each member of the pair
- // is assumed to be the same). This length is then compared to the
- // chain-length threshold to determine if a given chain is significant
- // enough to be vectorized. The length is also used in comparing
- // candidate chains where longer chains are considered to be better.
- // Note: when this function returns 0, the resulting instructions are
- // not actually fused.
- inline size_t getDepthFactor(Value *V) {
- // InsertElement and ExtractElement have a depth factor of zero. This is
- // for two reasons: First, they cannot be usefully fused. Second, because
- // the pass generates a lot of these, they can confuse the simple metric
- // used to compare the dags in the next iteration. Thus, giving them a
- // weight of zero allows the pass to essentially ignore them in
- // subsequent iterations when looking for vectorization opportunities
- // while still tracking dependency chains that flow through those
- // instructions.
- if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
- return 0;
-
- // Give a load or store half of the required depth so that load/store
- // pairs will vectorize.
- if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
- return Config.ReqChainDepth/2;
-
- return 1;
- }
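
A worked example of the weighting, assuming the defaults above (ReqChainDepth = 6, so loads and stores count 6/2 = 3): a load -> add -> store chain scores 3 + 1 + 3 = 7 and clears the threshold, while three connected arithmetic ops score only 3 and do not. As a hypothetical use inside a BBVectorize member:

size_t Len = getDepthFactor(Load)               // 3 (boosted memory op)
           + getDepthFactor(Add)                // 1
           + getDepthFactor(Store);             // 3
bool Significant = Len >= Config.ReqChainDepth; // 7 >= 6
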
-
- // Returns the cost of the provided instruction using TTI.
- // This does not handle loads and stores.
- unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue,
- const Instruction *I = nullptr) {
- switch (Opcode) {
- default: break;
- case Instruction::GetElementPtr:
- // We mark this instruction as zero-cost because scalar GEPs are usually
- // lowered to the instruction addressing mode. At the moment we don't
- // generate vector GEPs.
- return 0;
- case Instruction::Br:
- return TTI->getCFInstrCost(Opcode);
- case Instruction::PHI:
- return 0;
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK);
- case Instruction::Select:
- case Instruction::ICmp:
- case Instruction::FCmp:
- return TTI->getCmpSelInstrCost(Opcode, T1, T2, I);
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast:
- case Instruction::ShuffleVector:
- return TTI->getCastInstrCost(Opcode, T1, T2, I);
- }
-
- return 1;
- }
-
- // This determines the relative offset of two loads or stores, returning
- // true if the offset could be determined to be some constant value.
- // For example, if OffsetInElmts == 1, then J accesses the memory directly
- // after I; if OffsetInElmts == -1 then I accesses the memory
- // directly after J.
- bool getPairPtrInfo(Instruction *I, Instruction *J,
- Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
- unsigned &IAddressSpace, unsigned &JAddressSpace,
- int64_t &OffsetInElmts, bool ComputeOffset = true) {
- OffsetInElmts = 0;
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- LoadInst *LJ = cast<LoadInst>(J);
- IPtr = LI->getPointerOperand();
- JPtr = LJ->getPointerOperand();
- IAlignment = LI->getAlignment();
- JAlignment = LJ->getAlignment();
- IAddressSpace = LI->getPointerAddressSpace();
- JAddressSpace = LJ->getPointerAddressSpace();
- } else {
- StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
- IPtr = SI->getPointerOperand();
- JPtr = SJ->getPointerOperand();
- IAlignment = SI->getAlignment();
- JAlignment = SJ->getAlignment();
- IAddressSpace = SI->getPointerAddressSpace();
- JAddressSpace = SJ->getPointerAddressSpace();
- }
-
- if (!ComputeOffset)
- return true;
-
- const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
- const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
-
- // If this is a trivial offset, then we'll get something like
- // 1*sizeof(type). With target data, which we need anyway, this will get
- // constant folded into a number.
- const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
- if (const SCEVConstant *ConstOffSCEV =
- dyn_cast<SCEVConstant>(OffsetSCEV)) {
- ConstantInt *IntOff = ConstOffSCEV->getValue();
- int64_t Offset = IntOff->getSExtValue();
- const DataLayout &DL = I->getModule()->getDataLayout();
- Type *VTy = IPtr->getType()->getPointerElementType();
- int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy);
-
- Type *VTy2 = JPtr->getType()->getPointerElementType();
- if (VTy != VTy2 && Offset < 0) {
- int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2);
- OffsetInElmts = Offset/VTy2TSS;
- return (std::abs(Offset) % VTy2TSS) == 0;
- }
-
- OffsetInElmts = Offset/VTyTSS;
- return (std::abs(Offset) % VTyTSS) == 0;
- }
-
- return false;
- }
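
A worked example (hypothetical values): for two i32 loads I and J reading p[0] and p[1], SCEV folds the pointer difference to the constant 4; with a type store size of 4 that gives OffsetInElmts == 4/4 == 1, meaning J reads the element directly after I, and the remainder check confirms the accesses fall on element boundaries.

void pairLoads(const int32_t *p, int32_t &x, int32_t &y) {
  x = p[0];   // I: IPtr = p
  y = p[1];   // J: Offset = 4 bytes, VTyTSS = 4 -> OffsetInElmts = 1
}
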
-
- // Returns true if the provided CallInst represents an intrinsic that can
- // be vectorized.
- bool isVectorizableIntrinsic(CallInst* I) {
- Function *F = I->getCalledFunction();
- if (!F) return false;
-
- Intrinsic::ID IID = F->getIntrinsicID();
- if (!IID) return false;
-
- switch(IID) {
- default:
- return false;
- case Intrinsic::sqrt:
- case Intrinsic::powi:
- case Intrinsic::sin:
- case Intrinsic::cos:
- case Intrinsic::log:
- case Intrinsic::log2:
- case Intrinsic::log10:
- case Intrinsic::exp:
- case Intrinsic::exp2:
- case Intrinsic::pow:
- case Intrinsic::round:
- case Intrinsic::copysign:
- case Intrinsic::ceil:
- case Intrinsic::nearbyint:
- case Intrinsic::rint:
- case Intrinsic::trunc:
- case Intrinsic::floor:
- case Intrinsic::fabs:
- case Intrinsic::minnum:
- case Intrinsic::maxnum:
- return Config.VectorizeMath;
- case Intrinsic::bswap:
- case Intrinsic::ctpop:
- case Intrinsic::ctlz:
- case Intrinsic::cttz:
- return Config.VectorizeBitManipulations;
- case Intrinsic::fma:
- case Intrinsic::fmuladd:
- return Config.VectorizeFMA;
- }
- }
-
- bool isPureIEChain(InsertElementInst *IE) {
- InsertElementInst *IENext = IE;
- do {
- if (!isa<UndefValue>(IENext->getOperand(0)) &&
- !isa<InsertElementInst>(IENext->getOperand(0))) {
- return false;
- }
- } while ((IENext =
- dyn_cast<InsertElementInst>(IENext->getOperand(0))));
-
- return true;
- }
- };
-
- // This function implements one vectorization iteration on the provided
- // basic block. It returns true if the block is changed.
- bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
- bool ShouldContinue;
- BasicBlock::iterator Start = BB.getFirstInsertionPt();
-
- std::vector<Value *> AllPairableInsts;
- DenseMap<Value *, Value *> AllChosenPairs;
- DenseSet<ValuePair> AllFixedOrderPairs;
- DenseMap<VPPair, unsigned> AllPairConnectionTypes;
- DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs,
- AllConnectedPairDeps;
-
- do {
- std::vector<Value *> PairableInsts;
- DenseMap<Value *, std::vector<Value *> > CandidatePairs;
- DenseSet<ValuePair> FixedOrderPairs;
- DenseMap<ValuePair, int> CandidatePairCostSavings;
- ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
- FixedOrderPairs,
- CandidatePairCostSavings,
- PairableInsts, NonPow2Len);
- if (PairableInsts.empty()) continue;
-
- // Build the candidate pair set for faster lookups.
- DenseSet<ValuePair> CandidatePairsSet;
- for (DenseMap<Value *, std::vector<Value *> >::iterator I =
- CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
- for (std::vector<Value *>::iterator J = I->second.begin(),
- JE = I->second.end(); J != JE; ++J)
- CandidatePairsSet.insert(ValuePair(I->first, *J));
-
- // Now we have a map of all of the pairable instructions and we need to
- // select the best possible pairing. A good pairing is one such that the
- // users of the pair are also paired. This defines a (directed) forest
- // over the pairs such that two pairs are connected iff the second pair
- // uses the first.
-
- // Note that it only matters that both members of the second pair use some
- // element of the first pair (to allow for splatting).
-
- DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs,
- ConnectedPairDeps;
- DenseMap<VPPair, unsigned> PairConnectionTypes;
- computeConnectedPairs(CandidatePairs, CandidatePairsSet,
- PairableInsts, ConnectedPairs, PairConnectionTypes);
- if (ConnectedPairs.empty()) continue;
-
- for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
- I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
- I != IE; ++I)
- for (std::vector<ValuePair>::iterator J = I->second.begin(),
- JE = I->second.end(); J != JE; ++J)
- ConnectedPairDeps[*J].push_back(I->first);
-
- // Build the pairable-instruction dependency map
- DenseSet<ValuePair> PairableInstUsers;
- buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
-
- // There is now a graph of the connected pairs. For each variable, pick
- // the pairing with the largest dag meeting the depth requirement on at
- // least one branch. Then select all pairings that are part of that dag
- // and remove them from the list of available pairings and pairable
- // variables.
-
- DenseMap<Value *, Value *> ChosenPairs;
- choosePairs(CandidatePairs, CandidatePairsSet,
- CandidatePairCostSavings,
- PairableInsts, FixedOrderPairs, PairConnectionTypes,
- ConnectedPairs, ConnectedPairDeps,
- PairableInstUsers, ChosenPairs);
-
- if (ChosenPairs.empty()) continue;
- AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
- PairableInsts.end());
- AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
-
- // Only for the chosen pairs, propagate information on fixed-order pairs,
- // pair connections, and their types to the data structures used by the
- // pair fusion procedures.
- for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
- IE = ChosenPairs.end(); I != IE; ++I) {
- if (FixedOrderPairs.count(*I))
- AllFixedOrderPairs.insert(*I);
- else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
- AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
-
- for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
- J != IE; ++J) {
- DenseMap<VPPair, unsigned>::iterator K =
- PairConnectionTypes.find(VPPair(*I, *J));
- if (K != PairConnectionTypes.end()) {
- AllPairConnectionTypes.insert(*K);
- } else {
- K = PairConnectionTypes.find(VPPair(*J, *I));
- if (K != PairConnectionTypes.end())
- AllPairConnectionTypes.insert(*K);
- }
- }
- }
-
- for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
- I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
- I != IE; ++I)
- for (std::vector<ValuePair>::iterator J = I->second.begin(),
- JE = I->second.end(); J != JE; ++J)
- if (AllPairConnectionTypes.count(VPPair(I->first, *J))) {
- AllConnectedPairs[I->first].push_back(*J);
- AllConnectedPairDeps[*J].push_back(I->first);
- }
- } while (ShouldContinue);
-
- if (AllChosenPairs.empty()) return false;
- NumFusedOps += AllChosenPairs.size();
-
- // A set of pairs has now been selected. It is now necessary to replace the
- // paired instructions with vector instructions. For this procedure each
- // operand must be replaced with a vector operand. This vector is formed
- // by using build_vector on the old operands. The replaced values are then
- // replaced with a vector_extract on the result. Subsequent optimization
- // passes should coalesce the build/extract combinations.
-
- fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
- AllPairConnectionTypes,
- AllConnectedPairs, AllConnectedPairDeps);
-
- // It is important to cleanup here so that future iterations of this
- // function have less work to do.
- (void)SimplifyInstructionsInBlock(&BB, TLI);
- return true;
- }
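
The fusion shape described above, by example (names hypothetical): pairing two scalar adds yields one two-wide add bracketed by build/extract scaffolding that later passes are expected to coalesce.

void fuseExample(int a0, int a1, int &r0, int &r1) {
  r0 = a0 + 1;   // I
  r1 = a1 + 1;   // J, chosen as I's pair
}
// After fusion, in IR terms:
//   %v  = <2 x i32> assembled from %a0 and %a1 via insertelement
//   %vr = add <2 x i32> %v, <i32 1, i32 1>
//   %r0 = extractelement <2 x i32> %vr, i32 0
//   %r1 = extractelement <2 x i32> %vr, i32 1
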
-
- // This function returns true if the provided instruction is capable of being
- // fused into a vector instruction. This determination is based only on the
- // type and other attributes of the instruction.
- bool BBVectorize::isInstVectorizable(Instruction *I,
- bool &IsSimpleLoadStore) {
- IsSimpleLoadStore = false;
-
- if (CallInst *C = dyn_cast<CallInst>(I)) {
- if (!isVectorizableIntrinsic(C))
- return false;
- } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
-      // Vectorize simple loads if possible:
- IsSimpleLoadStore = L->isSimple();
- if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
- return false;
- } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
-      // Vectorize simple stores if possible:
- IsSimpleLoadStore = S->isSimple();
- if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
- return false;
- } else if (CastInst *C = dyn_cast<CastInst>(I)) {
- // We can vectorize casts, but not casts of pointer types, etc.
- if (!Config.VectorizeCasts)
- return false;
-
- Type *SrcTy = C->getSrcTy();
- if (!SrcTy->isSingleValueType())
- return false;
-
- Type *DestTy = C->getDestTy();
- if (!DestTy->isSingleValueType())
- return false;
- } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
- if (!Config.VectorizeSelect)
- return false;
- // We can vectorize a select if either all operands are scalars,
- // or all operands are vectors. Trying to "widen" a select between
- // vectors that has a scalar condition results in a malformed select.
- // FIXME: We could probably be smarter about this by rewriting the select
- // with different types instead.
- return (SI->getCondition()->getType()->isVectorTy() ==
- SI->getTrueValue()->getType()->isVectorTy());
- } else if (isa<CmpInst>(I)) {
- if (!Config.VectorizeCmp)
- return false;
- } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
- if (!Config.VectorizeGEP)
- return false;
-
- // Currently, vector GEPs exist only with one index.
- if (G->getNumIndices() != 1)
- return false;
- } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
- isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
- return false;
- }
-
- Type *T1, *T2;
- getInstructionTypes(I, T1, T2);
-
- // Not every type can be vectorized...
- if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
- !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
- return false;
-
- if (T1->getScalarSizeInBits() == 1) {
- if (!Config.VectorizeBools)
- return false;
- } else {
- if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
- return false;
- }
-
- if (T2->getScalarSizeInBits() == 1) {
- if (!Config.VectorizeBools)
- return false;
- } else {
- if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
- return false;
- }
-
- if (!Config.VectorizeFloats
- && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
- return false;
-
- // Don't vectorize target-specific types.
- if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
- return false;
- if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
- return false;
-
- if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() ||
- T2->getScalarType()->isPointerTy()))
- return false;
-
- if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
- T2->getPrimitiveSizeInBits() >= Config.VectorBits))
- return false;
-
- return true;
- }
-
- // This function returns true if the two provided instructions are compatible
- // (meaning that they can be fused into a vector instruction). This assumes
- // that I has already been determined to be vectorizable and that J is not
- // in the use dag of I.
- bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
- bool IsSimpleLoadStore, bool NonPow2Len,
- int &CostSavings, int &FixedOrder) {
- DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
- " <-> " << *J << "\n");
-
- CostSavings = 0;
- FixedOrder = 0;
-
- // Loads and stores can be merged if they have different alignments,
- // but are otherwise the same.
- if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
- (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
- return false;
-
- Type *IT1, *IT2, *JT1, *JT2;
- getInstructionTypes(I, IT1, IT2);
- getInstructionTypes(J, JT1, JT2);
- unsigned MaxTypeBits = std::max(
- IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
- IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
- if (!TTI && MaxTypeBits > Config.VectorBits)
- return false;
-
- // FIXME: handle addsub-type operations!
-
- if (IsSimpleLoadStore) {
- Value *IPtr, *JPtr;
- unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
- int64_t OffsetInElmts = 0;
- if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
- IAddressSpace, JAddressSpace, OffsetInElmts) &&
- std::abs(OffsetInElmts) == 1) {
- FixedOrder = (int) OffsetInElmts;
- unsigned BottomAlignment = IAlignment;
- if (OffsetInElmts < 0) BottomAlignment = JAlignment;
-
- Type *aTypeI = isa<StoreInst>(I) ?
- cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
- Type *aTypeJ = isa<StoreInst>(J) ?
- cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
- Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
-
- if (Config.AlignedOnly) {
- // An aligned load or store is possible only if the instruction
- // with the lower offset has an alignment suitable for the
- // vector type.
- const DataLayout &DL = I->getModule()->getDataLayout();
- unsigned VecAlignment = DL.getPrefTypeAlignment(VType);
- if (BottomAlignment < VecAlignment)
- return false;
- }
-
- if (TTI) {
- unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
- IAlignment, IAddressSpace);
- unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
- JAlignment, JAddressSpace);
- unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
- BottomAlignment,
- IAddressSpace);
-
- ICost += TTI->getAddressComputationCost(aTypeI);
- JCost += TTI->getAddressComputationCost(aTypeJ);
- VCost += TTI->getAddressComputationCost(VType);
-
- if (VCost > ICost + JCost)
- return false;
-
- // We don't want to fuse to a type that will be split, even
- // if the two input types will also be split and there is no other
- // associated cost.
- unsigned VParts = TTI->getNumberOfParts(VType);
- if (VParts > 1)
- return false;
- else if (!VParts && VCost == ICost + JCost)
- return false;
-
- CostSavings = ICost + JCost - VCost;
- }
- } else {
- return false;
- }
- } else if (TTI) {
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue;
- unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I);
- unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J);
- Type *VT1 = getVecTypeForPair(IT1, JT1),
- *VT2 = getVecTypeForPair(IT2, JT2);
-
-      // On some targets (e.g., X86) the cost of a vector shift may vary
- // depending on whether the second operand is a Uniform or
- // NonUniform Constant.
- switch (I->getOpcode()) {
- default : break;
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
-
- // If both I and J are scalar shifts by constant, then the
- // merged vector shift count would be either a constant splat value
- // or a non-uniform vector of constants.
- if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) {
- if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1)))
- Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue :
- TargetTransformInfo::OK_NonUniformConstantValue;
- } else {
-          // Check for a splat of a constant or for a non-uniform vector
- // of constants.
- Value *IOp = I->getOperand(1);
- Value *JOp = J->getOperand(1);
- if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) &&
- (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) {
- Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
- Constant *SplatValue = cast<Constant>(IOp)->getSplatValue();
- if (SplatValue != nullptr &&
- SplatValue == cast<Constant>(JOp)->getSplatValue())
- Op2VK = TargetTransformInfo::OK_UniformConstantValue;
- }
- }
- }
-
- // Note that this procedure is incorrect for insert and extract element
- // instructions (because combining these often results in a shuffle),
- // but this cost is ignored (because insert and extract element
- // instructions are assigned a zero depth factor and are not really
- // fused in general).
- unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I);
-
- if (VCost > ICost + JCost)
- return false;
-
- // We don't want to fuse to a type that will be split, even
- // if the two input types will also be split and there is no other
- // associated cost.
- unsigned VParts1 = TTI->getNumberOfParts(VT1),
- VParts2 = TTI->getNumberOfParts(VT2);
- if (VParts1 > 1 || VParts2 > 1)
- return false;
- else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
- return false;
-
- CostSavings = ICost + JCost - VCost;
- }
-
-    // The powi, ctlz, and cttz intrinsics are special because only the
-    // first argument is vectorized; the second arguments must be equal.
- CallInst *CI = dyn_cast<CallInst>(I);
- Function *FI;
- if (CI && (FI = CI->getCalledFunction())) {
- Intrinsic::ID IID = FI->getIntrinsicID();
- if (IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
- IID == Intrinsic::cttz) {
- Value *A1I = CI->getArgOperand(1),
- *A1J = cast<CallInst>(J)->getArgOperand(1);
- const SCEV *A1ISCEV = SE->getSCEV(A1I),
- *A1JSCEV = SE->getSCEV(A1J);
- return (A1ISCEV == A1JSCEV);
- }
-
- if (IID && TTI) {
- FastMathFlags FMFCI;
- if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI))
- FMFCI = FPMOCI->getFastMathFlags();
- SmallVector<Value *, 4> IArgs(CI->arg_operands());
- unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI);
-
- CallInst *CJ = cast<CallInst>(J);
-
- FastMathFlags FMFCJ;
- if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ))
- FMFCJ = FPMOCJ->getFastMathFlags();
-
- SmallVector<Value *, 4> JArgs(CJ->arg_operands());
- unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ);
-
- assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
- "Intrinsic argument counts differ");
- SmallVector<Type*, 4> Tys;
- SmallVector<Value *, 4> VecArgs;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
- if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
- IID == Intrinsic::cttz) && i == 1) {
- Tys.push_back(CI->getArgOperand(i)->getType());
- VecArgs.push_back(CI->getArgOperand(i));
- }
- else {
- Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
- CJ->getArgOperand(i)->getType()));
- // Add both operands, and then count their scalarization overhead
- // with VF 1.
- VecArgs.push_back(CI->getArgOperand(i));
- VecArgs.push_back(CJ->getArgOperand(i));
- }
- }
-
- // Compute the scalarization cost here with the original operands (to
- // check for uniqueness etc), and then call getIntrinsicInstrCost()
- // with the constructed vector types.
- Type *RetTy = getVecTypeForPair(IT1, JT1);
- unsigned ScalarizationCost = 0;
- if (!RetTy->isVoidTy())
- ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false);
- ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1);
-
- FastMathFlags FMFV = FMFCI;
- FMFV &= FMFCJ;
- unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV,
- ScalarizationCost);
-
- if (VCost > ICost + JCost)
- return false;
-
- // We don't want to fuse to a type that will be split, even
- // if the two input types will also be split and there is no other
- // associated cost.
- unsigned RetParts = TTI->getNumberOfParts(RetTy);
- if (RetParts > 1)
- return false;
- else if (!RetParts && VCost == ICost + JCost)
- return false;
-
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
- if (!Tys[i]->isVectorTy())
- continue;
-
- unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
- if (NumParts > 1)
- return false;
- else if (!NumParts && VCost == ICost + JCost)
- return false;
- }
-
- CostSavings = ICost + JCost - VCost;
- }
- }
-
- return true;
- }
-
- // Figure out whether or not J uses I and update the users and write-set
- // structures associated with I. Specifically, Users represents the set of
- // instructions that depend on I. WriteSet represents the set
- // of memory locations that are dependent on I. If UpdateUsers is true,
- // and J uses I, then Users is updated to contain J and WriteSet is updated
- // to contain any memory locations to which J writes. The function returns
- // true if J uses I. By default, alias analysis is used to determine
- // whether J reads from memory that overlaps with a location in WriteSet.
- // If LoadMoveSet is not null, then it is a previously-computed map
- // where the key is the memory-based user instruction and the value is
- // the instruction to be compared with I. So, if LoadMoveSet is provided,
- // then the alias analysis is not used. This is necessary because this
- // function is called during the process of moving instructions during
- // vectorization and the results of the alias analysis are not stable during
- // that process.
- bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
- AliasSetTracker &WriteSet, Instruction *I,
- Instruction *J, bool UpdateUsers,
- DenseSet<ValuePair> *LoadMoveSetPairs) {
- bool UsesI = false;
-
- // This instruction may already be marked as a user due, for example, to
- // being a member of a selected pair.
- if (Users.count(J))
- UsesI = true;
-
- if (!UsesI)
- for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
- JU != JE; ++JU) {
- Value *V = *JU;
- if (I == V || Users.count(V)) {
- UsesI = true;
- break;
- }
- }
- if (!UsesI && J->mayReadFromMemory()) {
- if (LoadMoveSetPairs) {
- UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
- } else {
- for (AliasSetTracker::iterator W = WriteSet.begin(),
- WE = WriteSet.end(); W != WE; ++W) {
- if (W->aliasesUnknownInst(J, *AA)) {
- UsesI = true;
- break;
- }
- }
- }
- }
-
- if (UsesI && UpdateUsers) {
- if (J->mayWriteToMemory()) WriteSet.add(J);
- Users.insert(J);
- }
-
- return UsesI;
- }
-
- // This function iterates over all instruction pairs in the provided
- // basic block and collects all candidate pairs for vectorization.
- bool BBVectorize::getCandidatePairs(BasicBlock &BB,
- BasicBlock::iterator &Start,
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts, bool NonPow2Len) {
- size_t TotalPairs = 0;
- BasicBlock::iterator E = BB.end();
- if (Start == E) return false;
-
- bool ShouldContinue = false, IAfterStart = false;
- for (BasicBlock::iterator I = Start++; I != E; ++I) {
- if (I == Start) IAfterStart = true;
-
- bool IsSimpleLoadStore;
- if (!isInstVectorizable(&*I, IsSimpleLoadStore))
- continue;
-
- // Look for an instruction with which to pair instruction *I...
- DenseSet<Value *> Users;
- AliasSetTracker WriteSet(*AA);
- if (I->mayWriteToMemory())
- WriteSet.add(&*I);
-
- bool JAfterStart = IAfterStart;
- BasicBlock::iterator J = std::next(I);
- for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
- if (J == Start)
- JAfterStart = true;
-
-        // Determine if J uses I; if so, exit the loop.
- bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep);
- if (Config.FastDep) {
- // Note: For this heuristic to be effective, independent operations
-          // must tend to be intermixed. This is likely to hold after some
-          // kinds of grouped loop unrolling (but not the generic LLVM pass);
-          // otherwise some kind of reordering pass may be required.
-
- // When using fast dependency analysis,
- // stop searching after first use:
- if (UsesI) break;
- } else {
- if (UsesI) continue;
- }
-
- // J does not use I, and comes before the first use of I, so it can be
- // merged with I if the instructions are compatible.
- int CostSavings, FixedOrder;
- if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len,
- CostSavings, FixedOrder))
- continue;
-
- // J is a candidate for merging with I.
- if (PairableInsts.empty() ||
- PairableInsts[PairableInsts.size() - 1] != &*I) {
- PairableInsts.push_back(&*I);
- }
-
- CandidatePairs[&*I].push_back(&*J);
- ++TotalPairs;
- if (TTI)
- CandidatePairCostSavings.insert(
- ValuePairWithCost(ValuePair(&*I, &*J), CostSavings));
-
- if (FixedOrder == 1)
- FixedOrderPairs.insert(ValuePair(&*I, &*J));
- else if (FixedOrder == -1)
- FixedOrderPairs.insert(ValuePair(&*J, &*I));
-
- // The next call to this function must start after the last instruction
- // selected during this invocation.
- if (JAfterStart) {
- Start = std::next(J);
- IAfterStart = JAfterStart = false;
- }
-
- DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
- << *I << " <-> " << *J << " (cost savings: " <<
- CostSavings << ")\n");
-
- // If we have already found too many pairs, break here and this function
- // will be called again starting after the last instruction selected
- // during this invocation.
- if (PairableInsts.size() >= Config.MaxInsts ||
- TotalPairs >= Config.MaxPairs) {
- ShouldContinue = true;
- break;
- }
- }
-
- if (ShouldContinue)
- break;
- }
-
- DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
- << " instructions with candidate pairs\n");
-
- return ShouldContinue;
- }
-
- // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
- // it looks for pairs such that both members have an input which is an
- // output of PI or PJ.
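-  // For example, given the candidate pair P = <A1, A2> and another candidate
-  // pair <B1, B2>: if B1 uses A1 and B2 uses A2, the connection is recorded
-  // as PairConnectionDirect; if instead B1 uses A2 and B2 uses A1, it is
-  // recorded as PairConnectionSwap (fusing would require swapped operands);
-  // and if both B1 and B2 use the same member of P, it is recorded as
-  // PairConnectionSplat.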
- void BBVectorize::computePairsConnectedTo(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- ValuePair P) {
- StoreInst *SI, *SJ;
-
- // For each possible pairing for this variable, look at the uses of
- // the first value...
- for (Value::user_iterator I = P.first->user_begin(),
- E = P.first->user_end();
- I != E; ++I) {
- User *UI = *I;
- if (isa<LoadInst>(UI)) {
- // A pair cannot be connected to a load because the load only takes one
- // operand (the address) and it is a scalar even after vectorization.
- continue;
- } else if ((SI = dyn_cast<StoreInst>(UI)) &&
- P.first == SI->getPointerOperand()) {
- // Similarly, a pair cannot be connected to a store through its
- // pointer operand.
- continue;
- }
-
- // For each use of the first variable, look for uses of the second
- // variable...
- for (User *UJ : P.second->users()) {
- if ((SJ = dyn_cast<StoreInst>(UJ)) &&
- P.second == SJ->getPointerOperand())
- continue;
-
- // Look for <I, J>:
- if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
- VPPair VP(P, ValuePair(UI, UJ));
- ConnectedPairs[VP.first].push_back(VP.second);
- PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
- }
-
- // Look for <J, I>:
- if (CandidatePairsSet.count(ValuePair(UJ, UI))) {
- VPPair VP(P, ValuePair(UJ, UI));
- ConnectedPairs[VP.first].push_back(VP.second);
- PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
- }
- }
-
- if (Config.SplatBreaksChain) continue;
- // Look for cases where just the first value in the pair is used by
- // both members of another pair (splatting).
- for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) {
- User *UJ = *J;
- if ((SJ = dyn_cast<StoreInst>(UJ)) &&
- P.first == SJ->getPointerOperand())
- continue;
-
- if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
- VPPair VP(P, ValuePair(UI, UJ));
- ConnectedPairs[VP.first].push_back(VP.second);
- PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
- }
- }
- }
-
- if (Config.SplatBreaksChain) return;
- // Look for cases where just the second value in the pair is used by
- // both members of another pair (splatting).
- for (Value::user_iterator I = P.second->user_begin(),
- E = P.second->user_end();
- I != E; ++I) {
- User *UI = *I;
- if (isa<LoadInst>(UI))
- continue;
- else if ((SI = dyn_cast<StoreInst>(UI)) &&
- P.second == SI->getPointerOperand())
- continue;
-
- for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) {
- User *UJ = *J;
- if ((SJ = dyn_cast<StoreInst>(UJ)) &&
- P.second == SJ->getPointerOperand())
- continue;
-
- if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
- VPPair VP(P, ValuePair(UI, UJ));
- ConnectedPairs[VP.first].push_back(VP.second);
- PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
- }
- }
- }
- }
-
- // This function figures out which pairs are connected. Two pairs are
- // connected if some output of the first pair forms an input to both members
- // of the second pair.
- void BBVectorize::computeConnectedPairs(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes) {
- for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
- PE = PairableInsts.end(); PI != PE; ++PI) {
- DenseMap<Value *, std::vector<Value *> >::iterator PP =
- CandidatePairs.find(*PI);
- if (PP == CandidatePairs.end())
- continue;
-
- for (std::vector<Value *>::iterator P = PP->second.begin(),
- E = PP->second.end(); P != E; ++P)
- computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
- PairableInsts, ConnectedPairs,
- PairConnectionTypes, ValuePair(*PI, *P));
- }
-
- DEBUG(size_t TotalPairs = 0;
- for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =
- ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
- TotalPairs += I->second.size();
- dbgs() << "BBV: found " << TotalPairs
- << " pair connections.\n");
- }
-
- // This function builds a set of use tuples such that <A, B> is in the set
- // if B is in the use dag of A. If B is in the use dag of A, then B
- // depends on the output of A.
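-  // For example, if b = add a, 1 and c = mul b, b, and all three values
-  // belong to candidate pairs, then <a, b>, <a, c>, and <b, c> are all
-  // recorded (trackUsesOfI accumulates users transitively, including
-  // memory-based dependencies).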
- void BBVectorize::buildDepMap(
- BasicBlock &BB,
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &PairableInstUsers) {
- DenseSet<Value *> IsInPair;
- for (DenseMap<Value *, std::vector<Value *> >::iterator C =
- CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) {
- IsInPair.insert(C->first);
- IsInPair.insert(C->second.begin(), C->second.end());
- }
-
- // Iterate through the basic block, recording all users of each
- // pairable instruction.
-
- BasicBlock::iterator E = BB.end(), EL =
- BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
- for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
- if (IsInPair.find(&*I) == IsInPair.end())
- continue;
-
- DenseSet<Value *> Users;
- AliasSetTracker WriteSet(*AA);
- if (I->mayWriteToMemory())
- WriteSet.add(&*I);
-
- for (BasicBlock::iterator J = std::next(I); J != E; ++J) {
- (void)trackUsesOfI(Users, WriteSet, &*I, &*J);
-
- if (J == EL)
- break;
- }
-
- for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
- U != E; ++U) {
- if (IsInPair.find(*U) == IsInPair.end()) continue;
- PairableInstUsers.insert(ValuePair(&*I, *U));
- }
-
- if (I == EL)
- break;
- }
- }
-
- // Returns true if an input to pair P is an output of pair Q and also an
- // input of pair Q is an output of pair P. If this is the case, then these
- // two pairs cannot be simultaneously fused.
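-  // For example, with P = <a, b> and Q = <x, y>: if x is in the use dag of a
-  // (so the fused x/y must follow the fused a/b) while b is in the use dag
-  // of y (so the fused a/b must follow the fused x/y), then fusing both
-  // pairs is impossible.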
- bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap,
- DenseSet<VPPair> *PairableInstUserPairSet) {
-    // Two pairs are in conflict if they are mutual users of each other.
- bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
- PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
- PairableInstUsers.count(ValuePair(P.second, Q.first)) ||
- PairableInstUsers.count(ValuePair(P.second, Q.second));
- bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) ||
- PairableInstUsers.count(ValuePair(Q.first, P.second)) ||
- PairableInstUsers.count(ValuePair(Q.second, P.first)) ||
- PairableInstUsers.count(ValuePair(Q.second, P.second));
- if (PairableInstUserMap) {
- // FIXME: The expensive part of the cycle check is not so much the cycle
- // check itself but this edge insertion procedure. This needs some
- // profiling and probably a different data structure.
- if (PUsesQ) {
- if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
- (*PairableInstUserMap)[Q].push_back(P);
- }
- if (QUsesP) {
- if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
- (*PairableInstUserMap)[P].push_back(Q);
- }
- }
-
- return (QUsesP && PUsesQ);
- }
-
- // This function walks the use graph of current pairs to see if, starting
- // from P, the walk returns to P.
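-  // For example, if the users of P include a pair Q (in CurrentPairs), and
-  // the users of Q in turn include P, then fusing P would create a
-  // dependency cycle between the two fused instructions, so P is rejected.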
- bool BBVectorize::pairWillFormCycle(ValuePair P,
- DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
- DenseSet<ValuePair> &CurrentPairs) {
- DEBUG(if (DebugCycleCheck)
- dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
- << *P.second << "\n");
-    // A lookup table of visited pairs is kept because the PairableInstUserMap
- // contains non-direct associations.
- DenseSet<ValuePair> Visited;
- SmallVector<ValuePair, 32> Q;
- // General depth-first post-order traversal:
- Q.push_back(P);
- do {
- ValuePair QTop = Q.pop_back_val();
- Visited.insert(QTop);
-
- DEBUG(if (DebugCycleCheck)
- dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
- << *QTop.second << "\n");
- DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
- PairableInstUserMap.find(QTop);
- if (QQ == PairableInstUserMap.end())
- continue;
-
- for (std::vector<ValuePair>::iterator C = QQ->second.begin(),
- CE = QQ->second.end(); C != CE; ++C) {
- if (*C == P) {
- DEBUG(dbgs()
- << "BBV: rejected to prevent non-trivial cycle formation: "
-                << *QTop.first << " <-> " << *C->second << "\n");
- return true;
- }
-
- if (CurrentPairs.count(*C) && !Visited.count(*C))
- Q.push_back(*C);
- }
- } while (!Q.empty());
-
- return false;
- }
-
- // This function builds the initial dag of connected pairs with the
- // pair J at the root.
- void BBVectorize::buildInitialDAGFor(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<ValuePair, size_t> &DAG, ValuePair J) {
- // Each of these pairs is viewed as the root node of a DAG. The DAG
- // is then walked (depth-first). As this happens, we keep track of
- // the pairs that compose the DAG and the maximum depth of the DAG.
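-    // For example, a chain of three connected pairs rooted at J, each with
-    // a depth factor of 1, yields a recorded maximum depth of 3.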
- SmallVector<ValuePairWithDepth, 32> Q;
- // General depth-first post-order traversal:
- Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
- do {
- ValuePairWithDepth QTop = Q.back();
-
- // Push each child onto the queue:
- bool MoreChildren = false;
- size_t MaxChildDepth = QTop.second;
- DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
- ConnectedPairs.find(QTop.first);
- if (QQ != ConnectedPairs.end())
- for (std::vector<ValuePair>::iterator k = QQ->second.begin(),
- ke = QQ->second.end(); k != ke; ++k) {
- // Make sure that this child pair is still a candidate:
- if (CandidatePairsSet.count(*k)) {
- DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k);
- if (C == DAG.end()) {
- size_t d = getDepthFactor(k->first);
- Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
- MoreChildren = true;
- } else {
- MaxChildDepth = std::max(MaxChildDepth, C->second);
- }
- }
- }
-
- if (!MoreChildren) {
- // Record the current pair as part of the DAG:
- DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
- Q.pop_back();
- }
- } while (!Q.empty());
- }
-
- // Given some initial dag, prune it by removing conflicting pairs (pairs
- // that cannot be simultaneously chosen for vectorization).
- void BBVectorize::pruneDAGFor(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
- DenseSet<VPPair> &PairableInstUserPairSet,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<ValuePair, size_t> &DAG,
- DenseSet<ValuePair> &PrunedDAG, ValuePair J,
- bool UseCycleCheck) {
- SmallVector<ValuePairWithDepth, 32> Q;
- // General depth-first post-order traversal:
- Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
- do {
- ValuePairWithDepth QTop = Q.pop_back_val();
- PrunedDAG.insert(QTop.first);
-
- // Visit each child, pruning as necessary...
- SmallVector<ValuePairWithDepth, 8> BestChildren;
- DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
- ConnectedPairs.find(QTop.first);
- if (QQ == ConnectedPairs.end())
- continue;
-
- for (std::vector<ValuePair>::iterator K = QQ->second.begin(),
- KE = QQ->second.end(); K != KE; ++K) {
- DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K);
- if (C == DAG.end()) continue;
-
- // This child is in the DAG, now we need to make sure it is the
- // best of any conflicting children. There could be multiple
- // conflicting children, so first, determine if we're keeping
- // this child, then delete conflicting children as necessary.
-
- // It is also necessary to guard against pairing-induced
- // dependencies. Consider instructions a .. x .. y .. b
- // such that (a,b) are to be fused and (x,y) are to be fused
- // but a is an input to x and b is an output from y. This
- // means that y cannot be moved after b but x must be moved
- // after b for (a,b) to be fused. In other words, after
- // fusing (a,b) we have y .. a/b .. x where y is an input
-        // to a/b and x is an output of a/b: x and y can no longer
- // be legally fused. To prevent this condition, we must
- // make sure that a child pair added to the DAG is not
- // both an input and output of an already-selected pair.
-
- // Pairing-induced dependencies can also form from more complicated
-        // cycles. The pair vs. pair conflicts are easy to check, and so
-        // they are checked explicitly for "fast rejection"; moreover, for
- // child vs. child conflicts, we may prefer to keep the current
- // pair in preference to the already-selected child.
- DenseSet<ValuePair> CurrentPairs;
-
- bool CanAdd = true;
- for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
- = BestChildren.begin(), E2 = BestChildren.end();
- C2 != E2; ++C2) {
- if (C2->first.first == C->first.first ||
- C2->first.first == C->first.second ||
- C2->first.second == C->first.first ||
- C2->first.second == C->first.second ||
- pairsConflict(C2->first, C->first, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : nullptr,
- UseCycleCheck ? &PairableInstUserPairSet
- : nullptr)) {
- if (C2->second >= C->second) {
- CanAdd = false;
- break;
- }
-
- CurrentPairs.insert(C2->first);
- }
- }
- if (!CanAdd) continue;
-
- // Even worse, this child could conflict with another node already
- // selected for the DAG. If that is the case, ignore this child.
- for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(),
- E2 = PrunedDAG.end(); T != E2; ++T) {
- if (T->first == C->first.first ||
- T->first == C->first.second ||
- T->second == C->first.first ||
- T->second == C->first.second ||
- pairsConflict(*T, C->first, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : nullptr,
- UseCycleCheck ? &PairableInstUserPairSet
- : nullptr)) {
- CanAdd = false;
- break;
- }
-
- CurrentPairs.insert(*T);
- }
- if (!CanAdd) continue;
-
- // And check the queue too...
- for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(),
- E2 = Q.end(); C2 != E2; ++C2) {
- if (C2->first.first == C->first.first ||
- C2->first.first == C->first.second ||
- C2->first.second == C->first.first ||
- C2->first.second == C->first.second ||
- pairsConflict(C2->first, C->first, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : nullptr,
- UseCycleCheck ? &PairableInstUserPairSet
- : nullptr)) {
- CanAdd = false;
- break;
- }
-
- CurrentPairs.insert(C2->first);
- }
- if (!CanAdd) continue;
-
- // Last but not least, check for a conflict with any of the
- // already-chosen pairs.
- for (DenseMap<Value *, Value *>::iterator C2 =
- ChosenPairs.begin(), E2 = ChosenPairs.end();
- C2 != E2; ++C2) {
- if (pairsConflict(*C2, C->first, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : nullptr,
- UseCycleCheck ? &PairableInstUserPairSet
- : nullptr)) {
- CanAdd = false;
- break;
- }
-
- CurrentPairs.insert(*C2);
- }
- if (!CanAdd) continue;
-
- // To check for non-trivial cycles formed by the addition of the
-        // current pair, we've formed a list of all relevant pairs; now use a
- // graph walk to check for a cycle. We start from the current pair and
- // walk the use dag to see if we again reach the current pair. If we
- // do, then the current pair is rejected.
-
- // FIXME: It may be more efficient to use a topological-ordering
- // algorithm to improve the cycle check. This should be investigated.
- if (UseCycleCheck &&
- pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
- continue;
-
- // This child can be added, but we may have chosen it in preference
- // to an already-selected child. Check for this here, and if a
- // conflict is found, then remove the previously-selected child
- // before adding this one in its place.
- for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
- = BestChildren.begin(); C2 != BestChildren.end();) {
- if (C2->first.first == C->first.first ||
- C2->first.first == C->first.second ||
- C2->first.second == C->first.first ||
- C2->first.second == C->first.second ||
- pairsConflict(C2->first, C->first, PairableInstUsers))
- C2 = BestChildren.erase(C2);
- else
- ++C2;
- }
-
- BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
- }
-
- for (SmallVectorImpl<ValuePairWithDepth>::iterator C
- = BestChildren.begin(), E2 = BestChildren.end();
- C != E2; ++C) {
- size_t DepthF = getDepthFactor(C->first.first);
- Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
- }
- } while (!Q.empty());
- }
-
-  // This function finds the best dag of mutually-compatible connected
- // pairs, given the choice of root pairs as an iterator range.
- void BBVectorize::findBestDAGFor(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
- DenseSet<VPPair> &PairableInstUserPairSet,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
- int &BestEffSize, Value *II, std::vector<Value *>&JJ,
- bool UseCycleCheck) {
- for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end();
- J != JE; ++J) {
- ValuePair IJ(II, *J);
- if (!CandidatePairsSet.count(IJ))
- continue;
-
- // Before going any further, make sure that this pair does not
- // conflict with any already-selected pairs (see comment below
- // near the DAG pruning for more details).
- DenseSet<ValuePair> ChosenPairSet;
- bool DoesConflict = false;
- for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
- E = ChosenPairs.end(); C != E; ++C) {
- if (pairsConflict(*C, IJ, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : nullptr,
- UseCycleCheck ? &PairableInstUserPairSet : nullptr)) {
- DoesConflict = true;
- break;
- }
-
- ChosenPairSet.insert(*C);
- }
- if (DoesConflict) continue;
-
- if (UseCycleCheck &&
- pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
- continue;
-
- DenseMap<ValuePair, size_t> DAG;
- buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
- PairableInsts, ConnectedPairs,
- PairableInstUsers, ChosenPairs, DAG, IJ);
-
-      // Because we'll keep the child with the largest depth, the maximum
-      // depth of the pruned DAG is the same as that of the unpruned DAG.
- size_t MaxDepth = DAG.lookup(IJ);
-
- DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
- << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
- MaxDepth << " and size " << DAG.size() << "\n");
-
-      // At this point the DAG has been constructed, but it may contain
-      // contradictory children (meaning that different children of
-      // some dag node may be attempting to fuse the same instruction).
-      // So now we walk the dag again and, in the case of a conflict,
- // keep only the child with the largest depth. To break a tie,
- // favor the first child.
-
- DenseSet<ValuePair> PrunedDAG;
- pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
- PairableInstUsers, PairableInstUserMap,
- PairableInstUserPairSet,
- ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
-
- int EffSize = 0;
- if (TTI) {
- DenseSet<Value *> PrunedDAGInstrs;
- for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
- E = PrunedDAG.end(); S != E; ++S) {
- PrunedDAGInstrs.insert(S->first);
- PrunedDAGInstrs.insert(S->second);
- }
-
- // The set of pairs that have already contributed to the total cost.
- DenseSet<ValuePair> IncomingPairs;
-
- // If the cost model were perfect, this might not be necessary; but we
- // need to make sure that we don't get stuck vectorizing our own
- // shuffle chains.
- bool HasNontrivialInsts = false;
-
- // The node weights represent the cost savings associated with
- // fusing the pair of instructions.
- for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
- E = PrunedDAG.end(); S != E; ++S) {
- if (!isa<ShuffleVectorInst>(S->first) &&
- !isa<InsertElementInst>(S->first) &&
- !isa<ExtractElementInst>(S->first))
- HasNontrivialInsts = true;
-
- bool FlipOrder = false;
-
- if (getDepthFactor(S->first)) {
- int ESContrib = CandidatePairCostSavings.find(*S)->second;
- DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
- << *S->first << " <-> " << *S->second << "} = " <<
- ESContrib << "\n");
- EffSize += ESContrib;
- }
-
- // The edge weights contribute in a negative sense: they represent
- // the cost of shuffles.
- DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS =
- ConnectedPairDeps.find(*S);
- if (SS != ConnectedPairDeps.end()) {
- unsigned NumDepsDirect = 0, NumDepsSwap = 0;
- for (std::vector<ValuePair>::iterator T = SS->second.begin(),
- TE = SS->second.end(); T != TE; ++T) {
- VPPair Q(*S, *T);
- if (!PrunedDAG.count(Q.second))
- continue;
- DenseMap<VPPair, unsigned>::iterator R =
- PairConnectionTypes.find(VPPair(Q.second, Q.first));
- assert(R != PairConnectionTypes.end() &&
- "Cannot find pair connection type");
- if (R->second == PairConnectionDirect)
- ++NumDepsDirect;
- else if (R->second == PairConnectionSwap)
- ++NumDepsSwap;
- }
-
-          // If there are more swaps than direct connections, then
-          // the pair order will be flipped during fusion, turning each
-          // swap into a direct connection (and vice versa). So the real
-          // number of swaps is the smaller of the two counts.
- FlipOrder = !FixedOrderPairs.count(*S) &&
- ((NumDepsSwap > NumDepsDirect) ||
- FixedOrderPairs.count(ValuePair(S->second, S->first)));
-
- for (std::vector<ValuePair>::iterator T = SS->second.begin(),
- TE = SS->second.end(); T != TE; ++T) {
- VPPair Q(*S, *T);
- if (!PrunedDAG.count(Q.second))
- continue;
- DenseMap<VPPair, unsigned>::iterator R =
- PairConnectionTypes.find(VPPair(Q.second, Q.first));
- assert(R != PairConnectionTypes.end() &&
- "Cannot find pair connection type");
- Type *Ty1 = Q.second.first->getType(),
- *Ty2 = Q.second.second->getType();
- Type *VTy = getVecTypeForPair(Ty1, Ty2);
- if ((R->second == PairConnectionDirect && FlipOrder) ||
- (R->second == PairConnectionSwap && !FlipOrder) ||
- R->second == PairConnectionSplat) {
- int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
- VTy, VTy);
-
- if (VTy->getVectorNumElements() == 2) {
- if (R->second == PairConnectionSplat)
- ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
- TargetTransformInfo::SK_Broadcast, VTy));
- else
- ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
- TargetTransformInfo::SK_Reverse, VTy));
- }
-
- DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
- *Q.second.first << " <-> " << *Q.second.second <<
- "} -> {" <<
- *S->first << " <-> " << *S->second << "} = " <<
- ESContrib << "\n");
- EffSize -= ESContrib;
- }
- }
- }
-
- // Compute the cost of outgoing edges. We assume that edges outgoing
- // to shuffles, inserts or extracts can be merged, and so contribute
- // no additional cost.
- if (!S->first->getType()->isVoidTy()) {
- Type *Ty1 = S->first->getType(),
- *Ty2 = S->second->getType();
- Type *VTy = getVecTypeForPair(Ty1, Ty2);
-
- bool NeedsExtraction = false;
- for (User *U : S->first->users()) {
- if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
- // Shuffle can be folded if it has no other input
- if (isa<UndefValue>(SI->getOperand(1)))
- continue;
- }
- if (isa<ExtractElementInst>(U))
- continue;
- if (PrunedDAGInstrs.count(U))
- continue;
- NeedsExtraction = true;
- break;
- }
-
- if (NeedsExtraction) {
- int ESContrib;
- if (Ty1->isVectorTy()) {
- ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
- Ty1, VTy);
- ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
- TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1));
- } else
- ESContrib = (int) TTI->getVectorInstrCost(
- Instruction::ExtractElement, VTy, 0);
-
- DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
- *S->first << "} = " << ESContrib << "\n");
- EffSize -= ESContrib;
- }
-
- NeedsExtraction = false;
- for (User *U : S->second->users()) {
- if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
- // Shuffle can be folded if it has no other input
- if (isa<UndefValue>(SI->getOperand(1)))
- continue;
- }
- if (isa<ExtractElementInst>(U))
- continue;
- if (PrunedDAGInstrs.count(U))
- continue;
- NeedsExtraction = true;
- break;
- }
-
- if (NeedsExtraction) {
- int ESContrib;
- if (Ty2->isVectorTy()) {
- ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
- Ty2, VTy);
- ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
- TargetTransformInfo::SK_ExtractSubvector, VTy,
- Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2));
- } else
- ESContrib = (int) TTI->getVectorInstrCost(
- Instruction::ExtractElement, VTy, 1);
- DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
- *S->second << "} = " << ESContrib << "\n");
- EffSize -= ESContrib;
- }
- }
-
- // Compute the cost of incoming edges.
- if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
- Instruction *S1 = cast<Instruction>(S->first),
- *S2 = cast<Instruction>(S->second);
- for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
- Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
-
- // Combining constants into vector constants (or small vector
-          // constants into larger ones) is assumed to be free.
- if (isa<Constant>(O1) && isa<Constant>(O2))
- continue;
-
- if (FlipOrder)
- std::swap(O1, O2);
-
- ValuePair VP = ValuePair(O1, O2);
- ValuePair VPR = ValuePair(O2, O1);
-
- // Internal edges are not handled here.
- if (PrunedDAG.count(VP) || PrunedDAG.count(VPR))
- continue;
-
- Type *Ty1 = O1->getType(),
- *Ty2 = O2->getType();
- Type *VTy = getVecTypeForPair(Ty1, Ty2);
-
- // Combining vector operations of the same type is also assumed
-          // to be foldable with other operations, and thus free.
- if (Ty1 == Ty2) {
- // If both are insert elements, then both can be widened.
- InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1),
- *IEO2 = dyn_cast<InsertElementInst>(O2);
- if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
- continue;
- // If both are extract elements, and both have the same input
- // type, then they can be replaced with a shuffle
- ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1),
- *EIO2 = dyn_cast<ExtractElementInst>(O2);
- if (EIO1 && EIO2 &&
- EIO1->getOperand(0)->getType() ==
- EIO2->getOperand(0)->getType())
- continue;
- // If both are a shuffle with equal operand types and only two
-          // unique operands, then they can be replaced with a single
-          // shuffle.
- ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1),
- *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
- if (SIO1 && SIO2 &&
- SIO1->getOperand(0)->getType() ==
- SIO2->getOperand(0)->getType()) {
- SmallSet<Value *, 4> SIOps;
- SIOps.insert(SIO1->getOperand(0));
- SIOps.insert(SIO1->getOperand(1));
- SIOps.insert(SIO2->getOperand(0));
- SIOps.insert(SIO2->getOperand(1));
- if (SIOps.size() <= 2)
- continue;
- }
- }
-
- int ESContrib;
- // This pair has already been formed.
- if (IncomingPairs.count(VP)) {
- continue;
- } else if (IncomingPairs.count(VPR)) {
- ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
- VTy, VTy);
-
- if (VTy->getVectorNumElements() == 2)
- ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
- TargetTransformInfo::SK_Reverse, VTy));
- } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
- ESContrib = (int) TTI->getVectorInstrCost(
- Instruction::InsertElement, VTy, 0);
- ESContrib += (int) TTI->getVectorInstrCost(
- Instruction::InsertElement, VTy, 1);
- } else if (!Ty1->isVectorTy()) {
- // O1 needs to be inserted into a vector of size O2, and then
- // both need to be shuffled together.
- ESContrib = (int) TTI->getVectorInstrCost(
- Instruction::InsertElement, Ty2, 0);
- ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
- VTy, Ty2);
- } else if (!Ty2->isVectorTy()) {
- // O2 needs to be inserted into a vector of size O1, and then
- // both need to be shuffled together.
- ESContrib = (int) TTI->getVectorInstrCost(
- Instruction::InsertElement, Ty1, 0);
- ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
- VTy, Ty1);
- } else {
- Type *TyBig = Ty1, *TySmall = Ty2;
- if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
- std::swap(TyBig, TySmall);
-
- ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
- VTy, TyBig);
- if (TyBig != TySmall)
- ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
- TyBig, TySmall);
- }
-
- DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
- << *O1 << " <-> " << *O2 << "} = " <<
- ESContrib << "\n");
- EffSize -= ESContrib;
- IncomingPairs.insert(VP);
- }
- }
- }
-
- if (!HasNontrivialInsts) {
- DEBUG(if (DebugPairSelection) dbgs() <<
- "\tNo non-trivial instructions in DAG;"
- " override to zero effective size\n");
- EffSize = 0;
- }
- } else {
- for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
- E = PrunedDAG.end(); S != E; ++S)
- EffSize += (int) getDepthFactor(S->first);
- }
-
- DEBUG(if (DebugPairSelection)
- dbgs() << "BBV: found pruned DAG for pair {"
- << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
- MaxDepth << " and size " << PrunedDAG.size() <<
- " (effective size: " << EffSize << ")\n");
- if (((TTI && !UseChainDepthWithTI) ||
- MaxDepth >= Config.ReqChainDepth) &&
- EffSize > 0 && EffSize > BestEffSize) {
- BestMaxDepth = MaxDepth;
- BestEffSize = EffSize;
- BestDAG = PrunedDAG;
- }
- }
- }
-
- // Given the list of candidate pairs, this function selects those
- // that will be fused into vector instructions.
- void BBVectorize::choosePairs(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<Value *, Value *>& ChosenPairs) {
- bool UseCycleCheck =
- CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
-
- DenseMap<Value *, std::vector<Value *> > CandidatePairs2;
- for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(),
- E = CandidatePairsSet.end(); I != E; ++I) {
- std::vector<Value *> &JJ = CandidatePairs2[I->second];
- if (JJ.empty()) JJ.reserve(32);
- JJ.push_back(I->first);
- }
-
- DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap;
- DenseSet<VPPair> PairableInstUserPairSet;
- for (std::vector<Value *>::iterator I = PairableInsts.begin(),
- E = PairableInsts.end(); I != E; ++I) {
- // The number of possible pairings for this variable:
- size_t NumChoices = CandidatePairs.lookup(*I).size();
- if (!NumChoices) continue;
-
- std::vector<Value *> &JJ = CandidatePairs[*I];
-
- // The best pair to choose and its dag:
- size_t BestMaxDepth = 0;
- int BestEffSize = 0;
- DenseSet<ValuePair> BestDAG;
- findBestDAGFor(CandidatePairs, CandidatePairsSet,
- CandidatePairCostSavings,
- PairableInsts, FixedOrderPairs, PairConnectionTypes,
- ConnectedPairs, ConnectedPairDeps,
- PairableInstUsers, PairableInstUserMap,
- PairableInstUserPairSet, ChosenPairs,
- BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
- UseCycleCheck);
-
- if (BestDAG.empty())
- continue;
-
- // A dag has been chosen (or not) at this point. If no dag was
- // chosen, then this instruction, I, cannot be paired (and is no longer
- // considered).
-
- DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "
- << *cast<Instruction>(*I) << "\n");
-
- for (DenseSet<ValuePair>::iterator S = BestDAG.begin(),
- SE2 = BestDAG.end(); S != SE2; ++S) {
- // Insert the members of this dag into the list of chosen pairs.
- ChosenPairs.insert(ValuePair(S->first, S->second));
- DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
- *S->second << "\n");
-
- // Remove all candidate pairs that have values in the chosen dag.
- std::vector<Value *> &KK = CandidatePairs[S->first];
- for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
- K != KE; ++K) {
- if (*K == S->second)
- continue;
-
- CandidatePairsSet.erase(ValuePair(S->first, *K));
- }
-
- std::vector<Value *> &LL = CandidatePairs2[S->second];
- for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
- L != LE; ++L) {
- if (*L == S->first)
- continue;
-
- CandidatePairsSet.erase(ValuePair(*L, S->second));
- }
-
- std::vector<Value *> &MM = CandidatePairs[S->second];
- for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
- M != ME; ++M) {
- assert(*M != S->first && "Flipped pair in candidate list?");
- CandidatePairsSet.erase(ValuePair(S->second, *M));
- }
-
- std::vector<Value *> &NN = CandidatePairs2[S->first];
- for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
- N != NE; ++N) {
- assert(*N != S->second && "Flipped pair in candidate list?");
- CandidatePairsSet.erase(ValuePair(*N, S->first));
- }
- }
- }
-
- DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
- }
-
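-  // Returns the name to give a replacement value derived from instruction I.
-  // For example, if I is named "x", then the input replacement for operand 0
-  // is named "x.v.i0", part 2 of that input "x.v.i0.2", and the first
-  // recreated output "x.v.r1".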
- std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
- unsigned n = 0) {
- if (!I->hasName())
- return "";
-
- return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
- (n > 0 ? "." + utostr(n) : "")).str();
- }
-
- // Returns the value that is to be used as the pointer input to the vector
- // instruction that fuses I with J.
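-  // For example (illustrative IR), when fusing two adjacent scalar i32
-  // loads, the pointer with the lower offset is reused and bitcast for a
-  // single vector load:
-  //   %vptr = bitcast i32* %p to <2 x i32>*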
- Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
- Instruction *I, Instruction *J, unsigned o) {
- Value *IPtr, *JPtr;
- unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
- int64_t OffsetInElmts;
-
-    // Note: the analysis might fail here; that is why the pair order has
- // been precomputed (OffsetInElmts must be unused here).
- (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
- IAddressSpace, JAddressSpace,
- OffsetInElmts, false);
-
- // The pointer value is taken to be the one with the lowest offset.
- Value *VPtr = IPtr;
-
- Type *ArgTypeI = IPtr->getType()->getPointerElementType();
- Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
- Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
- Type *VArgPtrType
- = PointerType::get(VArgType,
- IPtr->getType()->getPointerAddressSpace());
- return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
- /* insert before */ I);
- }
-
- void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
- unsigned MaskOffset, unsigned NumInElem,
- unsigned NumInElem1, unsigned IdxOffset,
- std::vector<Constant*> &Mask) {
- unsigned NumElem1 = J->getType()->getVectorNumElements();
- for (unsigned v = 0; v < NumElem1; ++v) {
- int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
- if (m < 0) {
- Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
- } else {
- unsigned mm = m + (int) IdxOffset;
- if (m >= (int) NumInElem1)
- mm += (int) NumInElem;
-
- Mask[v+MaskOffset] =
- ConstantInt::get(Type::getInt32Ty(Context), mm);
- }
- }
- }
-
- // Returns the value that is to be used as the vector-shuffle mask to the
- // vector instruction that fuses I with J.
- Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
- Instruction *I, Instruction *J) {
- // This is the shuffle mask. We need to append the second
- // mask to the first, and the numbers need to be adjusted.
-
- Type *ArgTypeI = I->getType();
- Type *ArgTypeJ = J->getType();
- Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-
- unsigned NumElemI = ArgTypeI->getVectorNumElements();
-
- // Get the total number of elements in the fused vector type.
- // By definition, this must equal the number of elements in
- // the final mask.
- unsigned NumElem = VArgType->getVectorNumElements();
- std::vector<Constant*> Mask(NumElem);
-
- Type *OpTypeI = I->getOperand(0)->getType();
- unsigned NumInElemI = OpTypeI->getVectorNumElements();
- Type *OpTypeJ = J->getOperand(0)->getType();
- unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
-
- // The fused vector will be:
- // -----------------------------------------------------
- // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
- // -----------------------------------------------------
- // from which we'll extract NumElem total elements (where the first NumElemI
- // of them come from the mask in I and the remainder come from the mask
-    // in J).
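-    // For example (values chosen for illustration): if I and J are both
-    // shuffles of <2 x i32> operands with masks <0, 3> and <1, 2>, the fused
-    // operands are laid out as [I0 | J0 | I1 | J1] (indices 0-1, 2-3, 4-5,
-    // 6-7), and the combined mask is <0, 5, 3, 6>: I's entry 3 (element 1 of
-    // I1) becomes 5, J's entry 1 (element 1 of J0) becomes 3, and J's entry
-    // 2 (element 0 of J1) becomes 6.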
-
- // For the mask from the first pair...
- fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI,
- 0, Mask);
-
- // For the mask from the second pair...
- fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
- NumInElemI, Mask);
-
- return ConstantVector::get(Mask);
- }
-
- bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, Value *&LOp,
- unsigned numElemL,
- Type *ArgTypeL, Type *ArgTypeH,
- bool IBeforeJ, unsigned IdxOff) {
- bool ExpandedIEChain = false;
- if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
- // If we have a pure insertelement chain, then this can be rewritten
- // into a chain that directly builds the larger type.
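-      // For example, a chain building a <2 x float> from scalars can be
-      // replayed as inserts directly into an undef <4 x float>, with each
-      // element index offset by IdxOff, avoiding a separate widening
-      // shuffle.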
- if (isPureIEChain(LIE)) {
- SmallVector<Value *, 8> VectElemts(numElemL,
- UndefValue::get(ArgTypeL->getScalarType()));
- InsertElementInst *LIENext = LIE;
- do {
- unsigned Idx =
- cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
- VectElemts[Idx] = LIENext->getOperand(1);
- } while ((LIENext =
- dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
-
- LIENext = nullptr;
- Value *LIEPrev = UndefValue::get(ArgTypeH);
- for (unsigned i = 0; i < numElemL; ++i) {
- if (isa<UndefValue>(VectElemts[i])) continue;
- LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
- ConstantInt::get(Type::getInt32Ty(Context),
- i + IdxOff),
- getReplacementName(IBeforeJ ? I : J,
- true, o, i+1));
- LIENext->insertBefore(IBeforeJ ? J : I);
- LIEPrev = LIENext;
- }
-
- LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
- ExpandedIEChain = true;
- }
- }
-
- return ExpandedIEChain;
- }
-
- static unsigned getNumScalarElements(Type *Ty) {
- if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
- return VecTy->getNumElements();
- return 1;
- }
-
- // Returns the value to be used as the specified operand of the vector
- // instruction that fuses I with J.
- Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, bool IBeforeJ) {
- Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
- Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
-
- // Compute the fused vector type for this operand
- Type *ArgTypeI = I->getOperand(o)->getType();
- Type *ArgTypeJ = J->getOperand(o)->getType();
- VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-
- Instruction *L = I, *H = J;
- Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
-
- unsigned numElemL = getNumScalarElements(ArgTypeL);
- unsigned numElemH = getNumScalarElements(ArgTypeH);
-
- Value *LOp = L->getOperand(o);
- Value *HOp = H->getOperand(o);
- unsigned numElem = VArgType->getNumElements();
-
- // First, we check if we can reuse the "original" vector outputs (if these
- // exist). We might need a shuffle.
- ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp);
- ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp);
- ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp);
- ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp);
-
- // FIXME: If we're fusing shuffle instructions, then we can't apply this
- // optimization. The input vectors to the shuffle might be a different
- // length from the shuffle outputs. Unfortunately, the replacement
- // shuffle mask has already been formed, and the mask entries are sensitive
- // to the sizes of the inputs.
- bool IsSizeChangeShuffle =
- isa<ShuffleVectorInst>(L) &&
- (LOp->getType() != L->getType() || HOp->getType() != H->getType());
-
- if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
- // We can have at most two unique vector inputs.
- bool CanUseInputs = true;
- Value *I1, *I2 = nullptr;
- if (LEE) {
- I1 = LEE->getOperand(0);
- } else {
- I1 = LSV->getOperand(0);
- I2 = LSV->getOperand(1);
- if (I2 == I1 || isa<UndefValue>(I2))
- I2 = nullptr;
- }
-
- if (HEE) {
- Value *I3 = HEE->getOperand(0);
- if (!I2 && I3 != I1)
- I2 = I3;
- else if (I3 != I1 && I3 != I2)
- CanUseInputs = false;
- } else {
- Value *I3 = HSV->getOperand(0);
- if (!I2 && I3 != I1)
- I2 = I3;
- else if (I3 != I1 && I3 != I2)
- CanUseInputs = false;
-
- if (CanUseInputs) {
- Value *I4 = HSV->getOperand(1);
- if (!isa<UndefValue>(I4)) {
- if (!I2 && I4 != I1)
- I2 = I4;
- else if (I4 != I1 && I4 != I2)
- CanUseInputs = false;
- }
- }
- }
-
- if (CanUseInputs) {
- unsigned LOpElem =
- cast<Instruction>(LOp)->getOperand(0)->getType()
- ->getVectorNumElements();
-
- unsigned HOpElem =
- cast<Instruction>(HOp)->getOperand(0)->getType()
- ->getVectorNumElements();
-
- // We have one or two input vectors. We need to map each index of the
- // operands to the index of the original vector.
- SmallVector<std::pair<int, int>, 8> II(numElem);
- for (unsigned i = 0; i < numElemL; ++i) {
- int Idx, INum;
- if (LEE) {
- Idx =
- cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
- INum = LEE->getOperand(0) == I1 ? 0 : 1;
- } else {
- Idx = LSV->getMaskValue(i);
- if (Idx < (int) LOpElem) {
- INum = LSV->getOperand(0) == I1 ? 0 : 1;
- } else {
- Idx -= LOpElem;
- INum = LSV->getOperand(1) == I1 ? 0 : 1;
- }
- }
-
- II[i] = std::pair<int, int>(Idx, INum);
- }
- for (unsigned i = 0; i < numElemH; ++i) {
- int Idx, INum;
- if (HEE) {
- Idx =
- cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
- INum = HEE->getOperand(0) == I1 ? 0 : 1;
- } else {
- Idx = HSV->getMaskValue(i);
- if (Idx < (int) HOpElem) {
- INum = HSV->getOperand(0) == I1 ? 0 : 1;
- } else {
- Idx -= HOpElem;
- INum = HSV->getOperand(1) == I1 ? 0 : 1;
- }
- }
-
- II[i + numElemL] = std::pair<int, int>(Idx, INum);
- }
-
- // We now have an array which tells us from which index of which
- // input vector each element of the operand comes.
- VectorType *I1T = cast<VectorType>(I1->getType());
- unsigned I1Elem = I1T->getNumElements();
-
- if (!I2) {
- // In this case there is only one underlying vector input. Check for
- // the trivial case where we can use the input directly.
- if (I1Elem == numElem) {
- bool ElemInOrder = true;
- for (unsigned i = 0; i < numElem; ++i) {
- if (II[i].first != (int) i && II[i].first != -1) {
- ElemInOrder = false;
- break;
- }
- }
-
- if (ElemInOrder)
- return I1;
- }
-
- // A shuffle is needed.
- std::vector<Constant *> Mask(numElem);
- for (unsigned i = 0; i < numElem; ++i) {
- int Idx = II[i].first;
- if (Idx == -1)
- Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
- else
- Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
- }
-
- Instruction *S =
- new ShuffleVectorInst(I1, UndefValue::get(I1T),
- ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J,
- true, o));
- S->insertBefore(IBeforeJ ? J : I);
- return S;
- }
-
- VectorType *I2T = cast<VectorType>(I2->getType());
- unsigned I2Elem = I2T->getNumElements();
-
- // This input comes from two distinct vectors. The first step is to
- // make sure that both vectors are the same length. If not, the
- // smaller one will need to grow before they can be shuffled together.
- if (I1Elem < I2Elem) {
- std::vector<Constant *> Mask(I2Elem);
- unsigned v = 0;
- for (; v < I1Elem; ++v)
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- for (; v < I2Elem; ++v)
- Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
- Instruction *NewI1 =
- new ShuffleVectorInst(I1, UndefValue::get(I1T),
- ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- NewI1->insertBefore(IBeforeJ ? J : I);
- I1 = NewI1;
- I1Elem = I2Elem;
- } else if (I1Elem > I2Elem) {
- std::vector<Constant *> Mask(I1Elem);
- unsigned v = 0;
- for (; v < I2Elem; ++v)
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- for (; v < I1Elem; ++v)
- Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
- Instruction *NewI2 =
- new ShuffleVectorInst(I2, UndefValue::get(I2T),
- ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- NewI2->insertBefore(IBeforeJ ? J : I);
- I2 = NewI2;
- }
-
- // Now that both I1 and I2 are the same length we can shuffle them
- // together (and use the result).
- std::vector<Constant *> Mask(numElem);
- for (unsigned v = 0; v < numElem; ++v) {
- if (II[v].first == -1) {
- Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
- } else {
- int Idx = II[v].first + II[v].second * I1Elem;
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
- }
- }
-
- Instruction *NewOp =
- new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J, true, o));
- NewOp->insertBefore(IBeforeJ ? J : I);
- return NewOp;
- }
- }
-
- Type *ArgType = ArgTypeL;
- if (numElemL < numElemH) {
- if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
- ArgTypeL, VArgType, IBeforeJ, 1)) {
- // This is another short-circuit case: we're combining a scalar into
- // a vector that is formed by an IE chain. We've just expanded the IE
-        // chain; now insert the scalar and we're done.
-
- Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
- getReplacementName(IBeforeJ ? I : J, true, o));
- S->insertBefore(IBeforeJ ? J : I);
- return S;
- } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
- ArgTypeH, IBeforeJ)) {
- // The two vector inputs to the shuffle must be the same length,
- // so extend the smaller vector to be the same length as the larger one.
- Instruction *NLOp;
- if (numElemL > 1) {
-
- std::vector<Constant *> Mask(numElemH);
- unsigned v = 0;
- for (; v < numElemL; ++v)
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- for (; v < numElemH; ++v)
- Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
- NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
- ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- } else {
- NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- }
-
- NLOp->insertBefore(IBeforeJ ? J : I);
- LOp = NLOp;
- }
-
- ArgType = ArgTypeH;
- } else if (numElemL > numElemH) {
- if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
- ArgTypeH, VArgType, IBeforeJ)) {
- Instruction *S =
- InsertElementInst::Create(LOp, HOp,
- ConstantInt::get(Type::getInt32Ty(Context),
- numElemL),
- getReplacementName(IBeforeJ ? I : J,
- true, o));
- S->insertBefore(IBeforeJ ? J : I);
- return S;
- } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
- ArgTypeL, IBeforeJ)) {
- Instruction *NHOp;
- if (numElemH > 1) {
- std::vector<Constant *> Mask(numElemL);
- unsigned v = 0;
- for (; v < numElemH; ++v)
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- for (; v < numElemL; ++v)
- Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
- NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
- ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- } else {
- NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- }
-
- NHOp->insertBefore(IBeforeJ ? J : I);
- HOp = NHOp;
- }
- }
-
- if (ArgType->isVectorTy()) {
- unsigned numElem = VArgType->getVectorNumElements();
- std::vector<Constant*> Mask(numElem);
- for (unsigned v = 0; v < numElem; ++v) {
- unsigned Idx = v;
- // If the low vector was expanded, we need to skip the extra
- // undefined entries.
- if (v >= numElemL && numElemH > numElemL)
- Idx += (numElemH - numElemL);
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
- }
-
- Instruction *BV = new ShuffleVectorInst(LOp, HOp,
- ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J, true, o));
- BV->insertBefore(IBeforeJ ? J : I);
- return BV;
- }
-
- Instruction *BV1 = InsertElementInst::Create(
- UndefValue::get(VArgType), LOp, CV0,
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- BV1->insertBefore(IBeforeJ ? J : I);
- Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
- getReplacementName(IBeforeJ ? I : J,
- true, o, 2));
- BV2->insertBefore(IBeforeJ ? J : I);
- return BV2;
- }
-
- // This function creates an array of values that will be used as the inputs
- // to the vector instruction that fuses I with J.
- void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
- Instruction *I, Instruction *J,
- SmallVectorImpl<Value *> &ReplacedOperands,
- bool IBeforeJ) {
- unsigned NumOperands = I->getNumOperands();
-
- for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
- // Iterate backward so that we look at the store pointer
- // first and know whether or not we need to flip the inputs.
-
- if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
- // This is the pointer for a load/store instruction.
- ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
- continue;
- } else if (isa<CallInst>(I)) {
- Function *F = cast<CallInst>(I)->getCalledFunction();
- Intrinsic::ID IID = F->getIntrinsicID();
- if (o == NumOperands-1) {
- BasicBlock &BB = *I->getParent();
-
- Module *M = BB.getParent()->getParent();
- Type *ArgTypeI = I->getType();
- Type *ArgTypeJ = J->getType();
- Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-
- ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
- continue;
- } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
- IID == Intrinsic::cttz) && o == 1) {
- // The second argument of powi/ctlz/cttz is a single integer/constant
- // and we've already checked that both arguments are equal.
- // As a result, we just keep I's second argument.
- ReplacedOperands[o] = I->getOperand(o);
- continue;
- }
- } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
- ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
- continue;
- }
-
- ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
- }
- }
-
- // This function creates two values that represent the outputs of the
- // original I and J instructions. These are generally vector shuffles
- // or extracts. In many cases, these will end up being unused and, thus,
- // eliminated by later passes.
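-  // For example (illustrative IR), if two scalar i32 instructions are fused
-  // into a <2 x i32> instruction %k, the scalar outputs are recreated as:
-  //   %k1 = extractelement <2 x i32> %k, i32 0
-  //   %k2 = extractelement <2 x i32> %k, i32 1
-  // If I and J are themselves vectors, shuffles extracting the low and high
-  // halves of %k are used instead.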
- void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
- Instruction *J, Instruction *K,
- Instruction *&InsertionPt,
- Instruction *&K1, Instruction *&K2) {
- if (isa<StoreInst>(I))
- return;
-
- Type *IType = I->getType();
- Type *JType = J->getType();
-
- VectorType *VType = getVecTypeForPair(IType, JType);
- unsigned numElem = VType->getNumElements();
-
- unsigned numElemI = getNumScalarElements(IType);
- unsigned numElemJ = getNumScalarElements(JType);
-
- if (IType->isVectorTy()) {
- std::vector<Constant *> Mask1(numElemI), Mask2(numElemI);
- for (unsigned v = 0; v < numElemI; ++v) {
- Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v);
- }
-
- K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
- ConstantVector::get(Mask1),
- getReplacementName(K, false, 1));
- } else {
- Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
- K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1));
- }
-
- if (JType->isVectorTy()) {
- std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ);
- for (unsigned v = 0; v < numElemJ; ++v) {
- Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v);
- }
-
- K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
- ConstantVector::get(Mask2),
- getReplacementName(K, false, 2));
- } else {
- Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1);
- K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2));
- }
-
- K1->insertAfter(K);
- K2->insertAfter(K1);
- InsertionPt = K2;
- }
-
-  // Determines whether all uses of instruction I (including pairing-induced
-  // uses) can be moved after J without creating a dependency cycle.
- bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
- DenseSet<ValuePair> &LoadMoveSetPairs,
- Instruction *I, Instruction *J) {
- // Skip to the first instruction past I.
- BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
-
- DenseSet<Value *> Users;
- AliasSetTracker WriteSet(*AA);
- if (I->mayWriteToMemory()) WriteSet.add(I);
-
- for (; cast<Instruction>(L) != J; ++L)
- (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs);
-
- assert(cast<Instruction>(L) == J &&
- "Tracking has not proceeded far enough to check for dependencies");
- // If J is now in the use set of I, then trackUsesOfI will return true
- // and we have a dependency cycle (and the fusing operation must abort).
- return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
- }
-
-  // Move all uses of the instruction I (including pairing-induced uses) after J.
- void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
- DenseSet<ValuePair> &LoadMoveSetPairs,
- Instruction *&InsertionPt,
- Instruction *I, Instruction *J) {
- // Skip to the first instruction past I.
- BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
-
- DenseSet<Value *> Users;
- AliasSetTracker WriteSet(*AA);
- if (I->mayWriteToMemory()) WriteSet.add(I);
-
- for (; cast<Instruction>(L) != J;) {
- if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) {
- // Move this instruction
- Instruction *InstToMove = &*L++;
-
- DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
- " to after " << *InsertionPt << "\n");
- InstToMove->removeFromParent();
- InstToMove->insertAfter(InsertionPt);
- InsertionPt = InstToMove;
- } else {
- ++L;
- }
- }
- }
-
-  // Collect all load instructions that are in the move set of a given first
- // pair member. These loads depend on the first instruction, I, and so need
- // to be moved after J (the second instruction) when the pair is fused.
- void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
- DenseSet<ValuePair> &LoadMoveSetPairs,
- Instruction *I) {
- // Skip to the first instruction past I.
- BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
-
- DenseSet<Value *> Users;
- AliasSetTracker WriteSet(*AA);
- if (I->mayWriteToMemory()) WriteSet.add(I);
-
- // Note: We cannot end the loop when we reach J because J could be moved
- // farther down the use chain by another instruction pairing. Also, J
- // could be before I if this is an inverted input.
- for (BasicBlock::iterator E = BB.end(); L != E; ++L) {
- if (trackUsesOfI(Users, WriteSet, I, &*L)) {
- if (L->mayReadFromMemory()) {
- LoadMoveSet[&*L].push_back(I);
- LoadMoveSetPairs.insert(ValuePair(&*L, I));
- }
- }
- }
- }
-
- // In cases where both load/stores and the computation of their pointers
- // are chosen for vectorization, we can end up in a situation where the
- // aliasing analysis starts returning different query results as the
- // process of fusing instruction pairs continues. Because the algorithm
- // relies on finding the same use dags here as were found earlier, we'll
- // need to precompute the necessary aliasing information here and then
- // manually update it during the fusion process.
- void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
- std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
- DenseSet<ValuePair> &LoadMoveSetPairs) {
- for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
- PIE = PairableInsts.end(); PI != PIE; ++PI) {
- DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
- if (P == ChosenPairs.end()) continue;
-
- Instruction *I = cast<Instruction>(P->first);
- collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
- LoadMoveSetPairs, I);
- }
- }
-
- // This function fuses the chosen instruction pairs into vector instructions,
-  // taking care to preserve any needed scalar outputs, and then reorders the
- // remaining instructions as needed (users of the first member of the pair
- // need to be moved to after the location of the second member of the pair
- // because the vector instruction is inserted in the location of the pair's
- // second member).
- void BBVectorize::fuseChosenPairs(BasicBlock &BB,
- std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) {
- LLVMContext& Context = BB.getContext();
-
- // During the vectorization process, the order of the pairs to be fused
- // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
- // list. After a pair is fused, the flipped pair is removed from the list.
- DenseSet<ValuePair> FlippedPairs;
- for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
- E = ChosenPairs.end(); P != E; ++P)
- FlippedPairs.insert(ValuePair(P->second, P->first));
- for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
- E = FlippedPairs.end(); P != E; ++P)
- ChosenPairs.insert(*P);
-
- DenseMap<Value *, std::vector<Value *> > LoadMoveSet;
- DenseSet<ValuePair> LoadMoveSetPairs;
- collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
- LoadMoveSet, LoadMoveSetPairs);
-
- DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
-
- for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
- DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI);
- if (P == ChosenPairs.end()) {
- ++PI;
- continue;
- }
-
- if (getDepthFactor(P->first) == 0) {
- // These instructions are not really fused, but are tracked as though
- // they are. Any case in which it would be interesting to fuse them
- // will be taken care of by InstCombine.
- --NumFusedOps;
- ++PI;
- continue;
- }
-
- Instruction *I = cast<Instruction>(P->first),
- *J = cast<Instruction>(P->second);
-
- DEBUG(dbgs() << "BBV: fusing: " << *I <<
- " <-> " << *J << "\n");
-
- // Remove the pair and flipped pair from the list.
- DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
- assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
- ChosenPairs.erase(FP);
- ChosenPairs.erase(P);
-
- if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) {
- DEBUG(dbgs() << "BBV: fusion of: " << *I <<
- " <-> " << *J <<
- " aborted because of non-trivial dependency cycle\n");
- --NumFusedOps;
- ++PI;
- continue;
- }
-
- // If the pair must have the other order, then flip it.
- bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
- if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
- // This pair does not have a fixed order, and so we might want to
- // flip it if that will yield fewer shuffles. We count the number
- // of dependencies connected via swaps, and those directly connected,
- // and flip the order if the number of swaps is greater.
- bool OrigOrder = true;
- DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ =
- ConnectedPairDeps.find(ValuePair(I, J));
- if (IJ == ConnectedPairDeps.end()) {
- IJ = ConnectedPairDeps.find(ValuePair(J, I));
- OrigOrder = false;
- }
-
- if (IJ != ConnectedPairDeps.end()) {
- unsigned NumDepsDirect = 0, NumDepsSwap = 0;
- for (std::vector<ValuePair>::iterator T = IJ->second.begin(),
- TE = IJ->second.end(); T != TE; ++T) {
- VPPair Q(IJ->first, *T);
- DenseMap<VPPair, unsigned>::iterator R =
- PairConnectionTypes.find(VPPair(Q.second, Q.first));
- assert(R != PairConnectionTypes.end() &&
- "Cannot find pair connection type");
- if (R->second == PairConnectionDirect)
- ++NumDepsDirect;
- else if (R->second == PairConnectionSwap)
- ++NumDepsSwap;
- }
-
- if (!OrigOrder)
- std::swap(NumDepsDirect, NumDepsSwap);
-
- if (NumDepsSwap > NumDepsDirect) {
- FlipPairOrder = true;
- DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
- " <-> " << *J << "\n");
- }
- }
- }
-
- Instruction *L = I, *H = J;
- if (FlipPairOrder)
- std::swap(H, L);
-
- // If the pair being fused uses the opposite order from that in the pair
- // connection map, then we need to flip the types.
- DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL =
- ConnectedPairs.find(ValuePair(H, L));
- if (HL != ConnectedPairs.end())
- for (std::vector<ValuePair>::iterator T = HL->second.begin(),
- TE = HL->second.end(); T != TE; ++T) {
- VPPair Q(HL->first, *T);
- DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q);
- assert(R != PairConnectionTypes.end() &&
- "Cannot find pair connection type");
- if (R->second == PairConnectionDirect)
- R->second = PairConnectionSwap;
- else if (R->second == PairConnectionSwap)
- R->second = PairConnectionDirect;
- }
-
- bool LBeforeH = !FlipPairOrder;
- unsigned NumOperands = I->getNumOperands();
- SmallVector<Value *, 3> ReplacedOperands(NumOperands);
- getReplacementInputsForPair(Context, L, H, ReplacedOperands,
- LBeforeH);
-
- // Make a copy of the original operation, change its type to the vector
- // type and replace its operands with the vector operands.
- Instruction *K = L->clone();
- if (L->hasName())
- K->takeName(L);
- else if (H->hasName())
- K->takeName(H);
-
- if (auto CS = CallSite(K)) {
- SmallVector<Type *, 3> Tys;
- FunctionType *Old = CS.getFunctionType();
- unsigned NumOld = Old->getNumParams();
- assert(NumOld <= ReplacedOperands.size());
- for (unsigned i = 0; i != NumOld; ++i)
- Tys.push_back(ReplacedOperands[i]->getType());
- CS.mutateFunctionType(
- FunctionType::get(getVecTypeForPair(L->getType(), H->getType()),
- Tys, Old->isVarArg()));
- } else if (!isa<StoreInst>(K))
- K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
-
- unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
- LLVMContext::MD_invariant_group};
- combineMetadata(K, H, KnownIDs);
- K->andIRFlags(H);
-
- for (unsigned o = 0; o < NumOperands; ++o)
- K->setOperand(o, ReplacedOperands[o]);
-
- K->insertAfter(J);
-
- // Instruction insertion point:
- Instruction *InsertionPt = K;
- Instruction *K1 = nullptr, *K2 = nullptr;
- replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
-
- // The use dag of the first original instruction must be moved to after
- // the location of the second instruction. The entire use dag of the
- // first instruction is disjoint from the input dag of the second
- // (by definition), and so commutes with it.
-
- moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
-
- if (!isa<StoreInst>(I)) {
- L->replaceAllUsesWith(K1);
- H->replaceAllUsesWith(K2);
- }
-
- // Instructions that may read from memory may be in the load move set.
- // Once an instruction is fused, we no longer need its move set, and so
- // the values of the map never need to be updated. However, when a load
- // is fused, we need to merge the entries from both instructions in the
- // pair in case those instructions were in the move set of some other
- // yet-to-be-fused pair. The loads in question are the keys of the map.
- if (I->mayReadFromMemory()) {
- std::vector<ValuePair> NewSetMembers;
- DenseMap<Value *, std::vector<Value *> >::iterator II =
- LoadMoveSet.find(I);
- if (II != LoadMoveSet.end())
- for (std::vector<Value *>::iterator N = II->second.begin(),
- NE = II->second.end(); N != NE; ++N)
- NewSetMembers.push_back(ValuePair(K, *N));
- DenseMap<Value *, std::vector<Value *> >::iterator JJ =
- LoadMoveSet.find(J);
- if (JJ != LoadMoveSet.end())
- for (std::vector<Value *>::iterator N = JJ->second.begin(),
- NE = JJ->second.end(); N != NE; ++N)
- NewSetMembers.push_back(ValuePair(K, *N));
- for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
- AE = NewSetMembers.end(); A != AE; ++A) {
- LoadMoveSet[A->first].push_back(A->second);
- LoadMoveSetPairs.insert(*A);
- }
- }
-
- // Before removing I, set the iterator to the next instruction.
- PI = std::next(BasicBlock::iterator(I));
- if (cast<Instruction>(PI) == J)
- ++PI;
-
- SE->forgetValue(I);
- SE->forgetValue(J);
- I->eraseFromParent();
- J->eraseFromParent();
-
- DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
- BB << "\n");
- }
-
- DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
- }
-}
-
-char BBVectorize::ID = 0;
-static const char bb_vectorize_name[] = "Basic-Block Vectorization";
-INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
-INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
-
-BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
- return new BBVectorize(C);
-}
-
-bool
-llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
- BBVectorize BBVectorizer(P, *BB.getParent(), C);
- return BBVectorizer.vectorizeBB(BB);
-}
-
-//===----------------------------------------------------------------------===//
-VectorizeConfig::VectorizeConfig() {
- VectorBits = ::VectorBits;
- VectorizeBools = !::NoBools;
- VectorizeInts = !::NoInts;
- VectorizeFloats = !::NoFloats;
- VectorizePointers = !::NoPointers;
- VectorizeCasts = !::NoCasts;
- VectorizeMath = !::NoMath;
- VectorizeBitManipulations = !::NoBitManipulation;
- VectorizeFMA = !::NoFMA;
- VectorizeSelect = !::NoSelect;
- VectorizeCmp = !::NoCmp;
- VectorizeGEP = !::NoGEP;
- VectorizeMemOps = !::NoMemOps;
- AlignedOnly = ::AlignedOnly;
- ReqChainDepth = ::ReqChainDepth;
- SearchLimit = ::SearchLimit;
- MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
- SplatBreaksChain = ::SplatBreaksChain;
- MaxInsts = ::MaxInsts;
- MaxPairs = ::MaxPairs;
- MaxIter = ::MaxIter;
- Pow2LenOnly = ::Pow2LenOnly;
- NoMemOpBoost = ::NoMemOpBoost;
- FastDep = ::FastDep;
-}
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 395f440bda47..1aea73cd4a32 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,5 +1,4 @@
add_llvm_library(LLVMVectorize
- BBVectorize.cpp
LoadStoreVectorizer.cpp
LoopVectorize.cpp
SLPVectorizer.cpp
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index eac2867233bc..193cc4d13787 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -114,12 +114,13 @@ static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
-/// We don't vectorize loops with a known constant trip count below this number.
+/// Loops with a known constant trip count below this number are vectorized only
+/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
"vectorizer-min-trip-count", cl::init(16), cl::Hidden,
- cl::desc("Don't vectorize loops with a constant "
- "trip count that is smaller than this "
- "value."));
+ cl::desc("Loops with a constant trip count that is smaller than this "
+ "value are vectorized only if no scalar iteration overheads "
+ "are incurred."));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -532,21 +533,34 @@ protected:
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
- /// Return a constant reference to the VectorParts corresponding to \p V from
- /// the original loop. If the value has already been vectorized, the
- /// corresponding vector entry in VectorLoopValueMap is returned. If,
+ /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
+ /// vector or scalar value on-demand if one is not yet available. When
+ /// vectorizing a loop, we visit the definition of an instruction before its
+ /// uses. When visiting the definition, we either vectorize or scalarize the
+ /// instruction, creating an entry for it in the corresponding map. (In some
+ /// cases, such as induction variables, we will create both vector and scalar
+ /// entries.) Then, as we encounter uses of the definition, we derive values
+ /// for each scalar or vector use unless such a value is already available.
+ /// For example, if we scalarize a definition and one of its uses is vector,
+ /// we build the required vector on-demand with an insertelement sequence
+ /// when visiting the use. Otherwise, if the use is scalar, we can use the
+ /// existing scalar definition.
+ ///
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll index \p Part. If the value has already been vectorized,
+ /// the corresponding vector entry in VectorLoopValueMap is returned. If,
/// however, the value has a scalar entry in VectorLoopValueMap, we construct
- /// new vector values on-demand by inserting the scalar values into vectors
+ /// a new vector value on-demand by inserting the scalar values into a vector
/// with an insertelement sequence. If the value has been neither vectorized
/// nor scalarized, it must be loop invariant, so we simply broadcast the
- /// value into vectors.
- const VectorParts &getVectorValue(Value *V);
+ /// value into a vector.
+ Value *getOrCreateVectorValue(Value *V, unsigned Part);
/// Return a value in the new loop corresponding to \p V from the original
/// loop at unroll index \p Part and vector index \p Lane. If the value has
/// been vectorized but not scalarized, the necessary extractelement
/// instruction will be generated.
- Value *getScalarValue(Value *V, unsigned Part, unsigned Lane);
+ Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane);
/// Try to vectorize the interleaved access group that \p Instr belongs to.
void vectorizeInterleaveGroup(Instruction *Instr);
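The on-demand scheme documented above can be summarized with a small standalone sketch; plain integers and standard containers stand in for llvm::Value and VectorLoopValueMap, and all names here are illustrative rather than the pass's API:

```cpp
#include <functional>
#include <map>
#include <utility>

// Key: (original value, unroll part). Mapped: the widened stand-in.
using Key = std::pair<int, unsigned>;

class OnDemandMap {
  std::map<Key, int> Widened;

public:
  // Return the widened value for (V, Part), building it on first use. When a
  // definition was already widened at its visit, the cached entry is reused;
  // otherwise the supplied callback builds it (e.g. an insertelement chain).
  int getOrCreate(int V, unsigned Part, const std::function<int(int)> &Widen) {
    auto It = Widened.find({V, Part});
    if (It != Widened.end())
      return It->second;
    int W = Widen(V);
    Widened.insert({{V, Part}, W});
    return W;
  }
};
```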
@@ -601,90 +615,103 @@ protected:
/// UF x VF scalar values in the new loop. UF and VF are the unroll and
/// vectorization factors, respectively.
///
- /// Entries can be added to either map with initVector and initScalar, which
- /// initialize and return a constant reference to the new entry. If a
- /// non-constant reference to a vector entry is required, getVector can be
- /// used to retrieve a mutable entry. We currently directly modify the mapped
- /// values during "fix-up" operations that occur once the first phase of
- /// widening is complete. These operations include type truncation and the
- /// second phase of recurrence widening.
+ /// Entries can be added to either map with setVectorValue and setScalarValue,
+ /// which assert that an entry was not already added before. If an entry is to
+ /// replace an existing one, call resetVectorValue. This is currently needed
+ /// to modify the mapped values during "fix-up" operations that occur once the
+ /// first phase of widening is complete. These operations include type
+ /// truncation and the second phase of recurrence widening.
///
- /// Otherwise, entries from either map should be accessed using the
- /// getVectorValue or getScalarValue functions from InnerLoopVectorizer.
- /// getVectorValue and getScalarValue coordinate to generate a vector or
- /// scalar value on-demand if one is not yet available. When vectorizing a
- /// loop, we visit the definition of an instruction before its uses. When
- /// visiting the definition, we either vectorize or scalarize the
- /// instruction, creating an entry for it in the corresponding map. (In some
- /// cases, such as induction variables, we will create both vector and scalar
- /// entries.) Then, as we encounter uses of the definition, we derive values
- /// for each scalar or vector use unless such a value is already available.
- /// For example, if we scalarize a definition and one of its uses is vector,
- /// we build the required vector on-demand with an insertelement sequence
- /// when visiting the use. Otherwise, if the use is scalar, we can use the
- /// existing scalar definition.
+ /// Entries from either map can be retrieved using the getVectorValue and
+ /// getScalarValue functions, which assert that the desired value exists.
+
struct ValueMap {
/// Construct an empty map with the given unroll and vectorization factors.
- ValueMap(unsigned UnrollFactor, unsigned VecWidth)
- : UF(UnrollFactor), VF(VecWidth) {
- // The unroll and vectorization factors are only used in asserts builds
- // to verify map entries are sized appropriately.
- (void)UF;
- (void)VF;
+ ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}
+
+ /// \return True if the map has any vector entry for \p Key.
+ bool hasAnyVectorValue(Value *Key) const {
+ return VectorMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a vector entry for \p Key and \p Part.
+ bool hasVectorValue(Value *Key, unsigned Part) const {
+ assert(Part < UF && "Queried Vector Part is too large.");
+ if (!hasAnyVectorValue(Key))
+ return false;
+ const VectorParts &Entry = VectorMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
+ return Entry[Part] != nullptr;
}
- /// \return True if the map has a vector entry for \p Key.
- bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); }
-
- /// \return True if the map has a scalar entry for \p Key.
- bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); }
-
- /// \brief Map \p Key to the given VectorParts \p Entry, and return a
- /// constant reference to the new vector map entry. The given key should
- /// not already be in the map, and the given VectorParts should be
- /// correctly sized for the current unroll factor.
- const VectorParts &initVector(Value *Key, const VectorParts &Entry) {
- assert(!hasVector(Key) && "Vector entry already initialized");
- assert(Entry.size() == UF && "VectorParts has wrong dimensions");
- VectorMapStorage[Key] = Entry;
- return VectorMapStorage[Key];
+ /// \return True if the map has any scalar entry for \p Key.
+ bool hasAnyScalarValue(Value *Key) const {
+ return ScalarMapStorage.count(Key);
}
- /// \brief Map \p Key to the given ScalarParts \p Entry, and return a
- /// constant reference to the new scalar map entry. The given key should
- /// not already be in the map, and the given ScalarParts should be
- /// correctly sized for the current unroll and vectorization factors.
- const ScalarParts &initScalar(Value *Key, const ScalarParts &Entry) {
- assert(!hasScalar(Key) && "Scalar entry already initialized");
- assert(Entry.size() == UF &&
- all_of(make_range(Entry.begin(), Entry.end()),
- [&](const SmallVectorImpl<Value *> &Values) -> bool {
- return Values.size() == VF;
- }) &&
- "ScalarParts has wrong dimensions");
- ScalarMapStorage[Key] = Entry;
- return ScalarMapStorage[Key];
+ /// \return True if the map has a scalar entry for \p Key, \p Part and
+ /// \p Lane.
+ bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const {
+ assert(Part < UF && "Queried Scalar Part is too large.");
+ assert(Lane < VF && "Queried Scalar Lane is too large.");
+ if (!hasAnyScalarValue(Key))
+ return false;
+ const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
+ assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions.");
+ return Entry[Part][Lane] != nullptr;
}
- /// \return A reference to the vector map entry corresponding to \p Key.
- /// The key should already be in the map. This function should only be used
- /// when it's necessary to update values that have already been vectorized.
- /// This is the case for "fix-up" operations including type truncation and
- /// the second phase of recurrence vectorization. If a non-const reference
- /// isn't required, getVectorValue should be used instead.
- VectorParts &getVector(Value *Key) {
- assert(hasVector(Key) && "Vector entry not initialized");
- return VectorMapStorage.find(Key)->second;
+ /// Retrieve the existing vector value that corresponds to \p Key and
+ /// \p Part.
+ Value *getVectorValue(Value *Key, unsigned Part) {
+ assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
+ return VectorMapStorage[Key][Part];
}
- /// Retrieve an entry from the vector or scalar maps. The preferred way to
- /// access an existing mapped entry is with getVectorValue or
- /// getScalarValue from InnerLoopVectorizer. Until those functions can be
- /// moved inside ValueMap, we have to declare them as friends.
- friend const VectorParts &InnerLoopVectorizer::getVectorValue(Value *V);
- friend Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
- unsigned Lane);
+ /// Retrieve the existing scalar value that corresponds to \p Key, \p Part
+ /// and \p Lane.
+ Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) {
+ assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value.");
+ return ScalarMapStorage[Key][Part][Lane];
+ }
+
+ /// Set a vector value associated with \p Key and \p Part. Assumes such a
+ /// value is not already set. If it is, use resetVectorValue() instead.
+ void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
+ if (!VectorMapStorage.count(Key)) {
+ VectorParts Entry(UF);
+ VectorMapStorage[Key] = Entry;
+ }
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Set a scalar value associated with \p Key for \p Part and \p Lane.
+ /// Assumes such a value is not already set.
+ void setScalarValue(Value *Key, unsigned Part, unsigned Lane,
+ Value *Scalar) {
+ assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set");
+ if (!ScalarMapStorage.count(Key)) {
+ ScalarParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part].resize(VF, nullptr);
+ // TODO: Consider storing uniform values only per-part, as they occupy
+ // lane 0 only, keeping the other VF-1 redundant entries null.
+ ScalarMapStorage[Key] = Entry;
+ }
+ ScalarMapStorage[Key][Part][Lane] = Scalar;
+ }
+
+ /// Reset the vector value associated with \p Key for the given \p Part.
+ /// This function can be used to update values that have already been
+ /// vectorized. This is the case for "fix-up" operations including type
+ /// truncation and the second phase of recurrence vectorization.
+ void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(hasVectorValue(Key, Part) && "Vector value not set for part");
+ VectorMapStorage[Key][Part] = Vector;
+ }
private:
/// The unroll factor. Each entry in the vector map contains UF vector
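A compact model of the set/get/reset contract the new ValueMap enforces, runnable in isolation (ints stand in for llvm::Value pointers; 0 marks an unset part):

```cpp
#include <cassert>
#include <map>
#include <vector>

struct VectorValueMapModel {
  unsigned UF;
  std::map<int, std::vector<int>> Storage; // key -> one value per part

  explicit VectorValueMapModel(unsigned UF) : UF(UF) {}

  bool hasVectorValue(int Key, unsigned Part) const {
    assert(Part < UF && "queried part is too large");
    auto It = Storage.find(Key);
    return It != Storage.end() && It->second[Part] != 0;
  }
  // First-phase widening: each part may be set exactly once.
  void setVectorValue(int Key, unsigned Part, int V) {
    assert(!hasVectorValue(Key, Part) && "use resetVectorValue to overwrite");
    Storage.emplace(Key, std::vector<int>(UF, 0)).first->second[Part] = V;
  }
  int getVectorValue(int Key, unsigned Part) const {
    assert(hasVectorValue(Key, Part) && "getting non-existent value");
    return Storage.at(Key)[Part];
  }
  // Fix-up phases (type truncation, recurrence widening) replace entries.
  void resetVectorValue(int Key, unsigned Part, int V) {
    assert(hasVectorValue(Key, Part) && "reset requires an existing entry");
    Storage.at(Key)[Part] = V;
  }
};
```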
@@ -1577,6 +1604,9 @@ public:
/// Return the first-order recurrences found in the loop.
RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
+ /// Return the set of instructions to sink to handle first-order recurrences.
+ DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
+
/// Returns the widest induction type.
Type *getWidestInductionType() { return WidestIndTy; }
@@ -1779,6 +1809,9 @@ private:
InductionList Inductions;
/// Holds the phi nodes that are first-order recurrences.
RecurrenceSet FirstOrderRecurrences;
+ /// Holds instructions that need to sink past other instructions to handle
+ /// first-order recurrences.
+ DenseMap<Instruction *, Instruction *> SinkAfter;
/// Holds the widest induction type encountered.
Type *WidestIndTy;
@@ -2417,15 +2450,13 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
&*LoopVectorBody->getFirstInsertionPt());
Instruction *LastInduction = VecInd;
- VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part] = LastInduction;
+ VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
+ if (isa<TruncInst>(EntryVal))
+ addMetadata(LastInduction, EntryVal);
LastInduction = cast<Instruction>(addFastMathFlag(
Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
}
- VectorLoopValueMap.initVector(EntryVal, Entry);
- if (isa<TruncInst>(EntryVal))
- addMetadata(Entry, EntryVal);
// Move the last step to the end of the latch block. This ensures consistent
// placement of all induction updates.
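To make the part-by-part chain concrete: for a hypothetical integer induction starting at 0 with step 1, VF = 4 and UF = 2, the loop above records <0,1,2,3> for part 0 and the step.add result <4,5,6,7> for part 1. A throwaway program printing those lanes:

```cpp
#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2;
  const int Step = 1; // each step.add advances by SplatVF = VF * Step
  for (unsigned Part = 0; Part < UF; ++Part) {
    std::printf("part %u:", Part);
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf(" %d", int(Part * VF + Lane) * Step);
    std::printf("\n");
  }
}
```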
@@ -2531,13 +2562,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
// induction variable, and build the necessary step vectors.
if (!VectorizedIV) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] =
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *EntryPart =
getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
- VectorLoopValueMap.initVector(EntryVal, Entry);
- if (Trunc)
- addMetadata(Entry, Trunc);
+ VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
+ if (Trunc)
+ addMetadata(EntryPart, Trunc);
+ }
}
// If an induction variable is only used for counting loop iterations or
@@ -2637,17 +2668,14 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF;
// Compute the scalar steps and save the results in VectorLoopValueMap.
- ScalarParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part].resize(VF);
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
- Entry[Part][Lane] = Add;
+ VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add);
}
}
- VectorLoopValueMap.initScalar(EntryVal, Entry);
}
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
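The index math in buildScalarSteps above reduces to one line; a standalone restatement, assuming an integer induction (the FP case swaps in FMul/FAdd as the code shows):

```cpp
#include <cstdint>

// Scalar value recorded for (Part, Lane): ScalarIV + (VF * Part + Lane) * Step.
int64_t scalarStep(int64_t ScalarIV, int64_t Step, unsigned VF, unsigned Part,
                   unsigned Lane) {
  return ScalarIV + int64_t(VF * Part + Lane) * Step;
}
```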
@@ -2665,8 +2693,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) {
return LAI->isUniform(V);
}
-const InnerLoopVectorizer::VectorParts &
-InnerLoopVectorizer::getVectorValue(Value *V) {
+Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
assert(!V->getType()->isVoidTy() && "Type does not produce a value");
@@ -2675,17 +2702,16 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
if (Legal->hasStride(V))
V = ConstantInt::get(V->getType(), 1);
- // If we have this scalar in the map, return it.
- if (VectorLoopValueMap.hasVector(V))
- return VectorLoopValueMap.VectorMapStorage[V];
+ // If we have a vector mapped to this value, return it.
+ if (VectorLoopValueMap.hasVectorValue(V, Part))
+ return VectorLoopValueMap.getVectorValue(V, Part);
// If the value has not been vectorized, check if it has been scalarized
// instead. If it has been scalarized, and we actually need the value in
// vector form, we will construct the vector values on demand.
- if (VectorLoopValueMap.hasScalar(V)) {
+ if (VectorLoopValueMap.hasAnyScalarValue(V)) {
- // Initialize a new vector map entry.
- VectorParts Entry(UF);
+ Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0);
// If we've scalarized a value, that value should be an instruction.
auto *I = cast<Instruction>(V);
@@ -2693,17 +2719,17 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// If we aren't vectorizing, we can just copy the scalar map values over to
// the vector map.
if (VF == 1) {
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = getScalarValue(V, Part, 0);
- return VectorLoopValueMap.initVector(V, Entry);
+ VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
+ return ScalarValue;
}
- // Get the last scalar instruction we generated for V. If the value is
- // known to be uniform after vectorization, this corresponds to lane zero
- // of the last unroll iteration. Otherwise, the last instruction is the one
- // we created for the last vector lane of the last unroll iteration.
+ // Get the last scalar instruction we generated for V and Part. If the value
+ // is known to be uniform after vectorization, this corresponds to lane zero
+ // of the Part unroll iteration. Otherwise, the last instruction is the one
+ // we created for the last vector lane of the Part unroll iteration.
unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
- auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));
+ auto *LastInst =
+ cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane));
// Set the insert point after the last scalarized instruction. This ensures
// the insertelement sequence will directly follow the scalar definitions.
@@ -2717,52 +2743,50 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// iteration. Otherwise, we construct the vector values using insertelement
// instructions. Since the resulting vectors are stored in
// VectorLoopValueMap, we will only generate the insertelements once.
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *VectorValue = nullptr;
- if (Cost->isUniformAfterVectorization(I, VF)) {
- VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));
- } else {
- VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
- for (unsigned Lane = 0; Lane < VF; ++Lane)
- VectorValue = Builder.CreateInsertElement(
- VectorValue, getScalarValue(V, Part, Lane),
- Builder.getInt32(Lane));
- }
- Entry[Part] = VectorValue;
+ Value *VectorValue = nullptr;
+ if (Cost->isUniformAfterVectorization(I, VF)) {
+ VectorValue = getBroadcastInstrs(ScalarValue);
+ } else {
+ VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
+ for (unsigned Lane = 0; Lane < VF; ++Lane)
+ VectorValue = Builder.CreateInsertElement(
+ VectorValue, getOrCreateScalarValue(V, Part, Lane),
+ Builder.getInt32(Lane));
}
+ VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
Builder.restoreIP(OldIP);
- return VectorLoopValueMap.initVector(V, Entry);
+ return VectorValue;
}
// If this scalar is unknown, assume that it is a constant or that it is
// loop invariant. Broadcast V and save the value for future uses.
Value *B = getBroadcastInstrs(V);
- return VectorLoopValueMap.initVector(V, VectorParts(UF, B));
+ VectorLoopValueMap.setVectorValue(V, Part, B);
+ return B;
}
-Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
- unsigned Lane) {
+Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part,
+ unsigned Lane) {
// If the value is not an instruction contained in the loop, it should
// already be scalar.
if (OrigLoop->isLoopInvariant(V))
return V;
- assert(Lane > 0 ?
- !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
- : true && "Uniform values only have lane zero");
+ assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
+ : true && "Uniform values only have lane zero");
// If the value from the original loop has not been vectorized, it is
// represented by UF x VF scalar values in the new loop. Return the requested
// scalar value.
- if (VectorLoopValueMap.hasScalar(V))
- return VectorLoopValueMap.ScalarMapStorage[V][Part][Lane];
+ if (VectorLoopValueMap.hasScalarValue(V, Part, Lane))
+ return VectorLoopValueMap.getScalarValue(V, Part, Lane);
// If the value has not been scalarized, get its entry in VectorLoopValueMap
// for the given unroll part. If this entry is not a vector type (i.e., the
// vectorization factor is one), there is no need to generate an
// extractelement instruction.
- auto *U = getVectorValue(V)[Part];
+ auto *U = getOrCreateVectorValue(V, Part);
if (!U->getType()->isVectorTy()) {
assert(VF == 1 && "Value not scalarized has non-vector type");
return U;
@@ -2844,7 +2868,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Index += (VF - 1) * Group->getFactor();
for (unsigned Part = 0; Part < UF; Part++) {
- Value *NewPtr = getScalarValue(Ptr, Part, 0);
+ Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0);
// Notice current instruction could be any index. Need to adjust the address
// to the member of index 0.
@@ -2887,7 +2911,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
if (!Member)
continue;
- VectorParts Entry(UF);
Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
for (unsigned Part = 0; Part < UF; Part++) {
Value *StridedVec = Builder.CreateShuffleVector(
@@ -2899,10 +2922,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
}
- Entry[Part] =
- Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
+ if (Group->isReverse())
+ StridedVec = reverseVector(StridedVec);
+
+ VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
}
- VectorLoopValueMap.initVector(Member, Entry);
}
return;
}
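The stride mask driving the shuffle above selects one member out of the interleaved wide vector; for interleave factor 2 and VF = 4, member 0 takes lanes {0,2,4,6} and member 1 takes {1,3,5,7}. A standalone sketch of the mask construction, mirroring what createStrideMask is used for here:

```cpp
#include <vector>

// Lane indices extracting member `Start` of an interleaved group.
std::vector<unsigned> strideMask(unsigned Start, unsigned Stride, unsigned VF) {
  std::vector<unsigned> Mask;
  for (unsigned I = 0; I < VF; ++I)
    Mask.push_back(Start + I * Stride);
  return Mask;
}
```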
@@ -2919,8 +2943,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Instruction *Member = Group->getMember(i);
assert(Member && "Fail to get a member from an interleaved store group");
- Value *StoredVec =
- getVectorValue(cast<StoreInst>(Member)->getValueOperand())[Part];
+ Value *StoredVec = getOrCreateVectorValue(
+ cast<StoreInst>(Member)->getValueOperand(), Part);
if (Group->isReverse())
StoredVec = reverseVector(StoredVec);
@@ -2981,16 +3005,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
bool CreateGatherScatter =
(Decision == LoopVectorizationCostModel::CM_GatherScatter);
- VectorParts VectorGep;
+ // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
+ // gather/scatter. Otherwise Decision should have been to Scalarize.
+ assert((ConsecutiveStride || CreateGatherScatter) &&
+ "The instruction should be scalarized");
// Handle consecutive loads/stores.
- if (ConsecutiveStride) {
- Ptr = getScalarValue(Ptr, 0, 0);
- } else {
- // At this point we should have a vector version of the GEP for Gather or Scatter
- assert(CreateGatherScatter && "The instruction should be scalarized");
- VectorGep = getVectorValue(Ptr);
- }
+ if (ConsecutiveStride)
+ Ptr = getOrCreateScalarValue(Ptr, 0, 0);
VectorParts Mask = createBlockInMask(Instr->getParent());
// Handle Stores:
@@ -2998,16 +3020,15 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
assert(!Legal->isUniform(SI->getPointerOperand()) &&
"We do not allow storing to uniform addresses");
setDebugLocFromInst(Builder, SI);
- // We don't want to update the value in the map as it might be used in
- // another expression. So don't use a reference type for "StoredVal".
- VectorParts StoredVal = getVectorValue(SI->getValueOperand());
for (unsigned Part = 0; Part < UF; ++Part) {
Instruction *NewSI = nullptr;
+ Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
if (CreateGatherScatter) {
Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
- NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part],
- Alignment, MaskPart);
+ Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+ NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
+ MaskPart);
} else {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr =
@@ -3016,7 +3037,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
if (Reverse) {
// If we store to reverse consecutive memory locations, then we need
// to reverse the order of elements in the stored value.
- StoredVal[Part] = reverseVector(StoredVal[Part]);
+ StoredVal = reverseVector(StoredVal);
+ // We don't want to update the value in the map as it might be used in
+ // another expression. So don't call resetVectorValue(StoredVal).
+
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
PartPtr =
@@ -3030,11 +3054,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
if (Legal->isMaskRequired(SI))
- NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
+ NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
Mask[Part]);
else
- NewSI =
- Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
+ NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
}
addMetadata(NewSI, SI);
}
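For reverse consecutive accesses, two pieces of index math cooperate: the stored value's lanes are reversed, and each part's pointer is rebased so the wide store starts at the last vector element. A standalone sketch of both; the offsets mirror the two GEPs emitted above, but treat this as illustrative rather than the pass's API:

```cpp
#include <cstdint>
#include <vector>

// Shuffle mask reversing the lanes of a VF-wide vector: VF-1, ..., 1, 0.
std::vector<unsigned> reverseMask(unsigned VF) {
  std::vector<unsigned> Mask;
  for (unsigned I = 0; I < VF; ++I)
    Mask.push_back(VF - 1 - I);
  return Mask;
}

// Element offset of the wide access for unroll part `Part` when the
// consecutive stride is reverse: -Part * VF steps the parts downward, and
// 1 - VF makes the access begin at its last element.
int64_t reversePartOffset(unsigned Part, unsigned VF) {
  return -int64_t(Part) * VF + (1 - int64_t(VF));
}
```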
@@ -3044,14 +3067,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// Handle loads.
assert(LI && "Must have a load instruction");
setDebugLocFromInst(Builder, LI);
- VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
- Instruction *NewLI;
+ Value *NewLI;
if (CreateGatherScatter) {
Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
- NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment, MaskPart,
+ Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+ NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
nullptr, "wide.masked.gather");
- Entry[Part] = NewLI;
+ addMetadata(NewLI, LI);
} else {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr =
@@ -3073,11 +3096,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
"wide.masked.load");
else
NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
- Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
+
+ // Add metadata to the load, but setVectorValue to the reversed shuffle.
+ addMetadata(NewLI, LI);
+ if (Reverse)
+ NewLI = reverseVector(NewLI);
}
- addMetadata(NewLI, LI);
+ VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
}
- VectorLoopValueMap.initVector(Instr, Entry);
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
@@ -3094,9 +3120,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- // Initialize a new scalar map entry.
- ScalarParts Entry(UF);
-
VectorParts Cond;
if (IfPredicateInstr)
Cond = createBlockInMask(Instr->getParent());
@@ -3108,7 +3131,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part].resize(VF);
// For each scalar that we create:
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
@@ -3129,7 +3151,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- auto *NewOp = getScalarValue(Instr->getOperand(op), Part, Lane);
+ auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane);
Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);
@@ -3138,7 +3160,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
Builder.Insert(Cloned);
// Add the cloned scalar to the scalar map entry.
- Entry[Part][Lane] = Cloned;
+ VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned);
// If we just cloned a new assumption, add it the assumption cache.
if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
@@ -3150,7 +3172,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
}
}
- VectorLoopValueMap.initScalar(Instr, Entry);
}
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
@@ -3786,10 +3807,10 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// If the value wasn't vectorized, we must maintain the original scalar
// type. The absence of the value from VectorLoopValueMap indicates that it
// wasn't vectorized.
- if (!VectorLoopValueMap.hasVector(KV.first))
+ if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
continue;
- VectorParts &Parts = VectorLoopValueMap.getVector(KV.first);
- for (Value *&I : Parts) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *I = getOrCreateVectorValue(KV.first, Part);
if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
continue;
Type *OriginalTy = I->getType();
@@ -3878,7 +3899,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
I->replaceAllUsesWith(Res);
cast<Instruction>(I)->eraseFromParent();
Erased.insert(I);
- I = Res;
+ VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
}
}
@@ -3887,15 +3908,15 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// If the value wasn't vectorized, we must maintain the original scalar
// type. The absence of the value from VectorLoopValueMap indicates that it
// wasn't vectorized.
- if (!VectorLoopValueMap.hasVector(KV.first))
+ if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
continue;
- VectorParts &Parts = VectorLoopValueMap.getVector(KV.first);
- for (Value *&I : Parts) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *I = getOrCreateVectorValue(KV.first, Part);
ZExtInst *Inst = dyn_cast<ZExtInst>(I);
if (Inst && Inst->use_empty()) {
Value *NewI = Inst->getOperand(0);
Inst->eraseFromParent();
- I = NewI;
+ VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
}
}
}
@@ -4025,28 +4046,29 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// We constructed a temporary phi node in the first phase of vectorization.
// This phi node will eventually be deleted.
- VectorParts &PhiParts = VectorLoopValueMap.getVector(Phi);
- Builder.SetInsertPoint(cast<Instruction>(PhiParts[0]));
+ Builder.SetInsertPoint(
+ cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
// Create a phi node for the new recurrence. The current value will either be
// the initial value inserted into a vector or loop-varying vector value.
auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
- // Get the vectorized previous value.
- auto &PreviousParts = getVectorValue(Previous);
+ // Get the vectorized previous value of the last part UF - 1. It appears last
+ // among all unrolled iterations, due to the order of their construction.
+ Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
// Set the insertion point after the previous value if it is an instruction.
// Note that the previous value may have been constant-folded so it is not
// guaranteed to be an instruction in the vector loop. Also, if the previous
// value is a phi node, we should insert after all the phi nodes to avoid
// breaking basic block verification.
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) ||
- isa<PHINode>(PreviousParts[UF - 1]))
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
+ isa<PHINode>(PreviousLastPart))
Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
else
Builder.SetInsertPoint(
- &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1])));
+ &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
// We will construct a vector for the recurrence by combining the values for
// the current and previous iterations. This is the required shuffle mask.
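That mask takes the final lane of the first shuffle operand followed by the leading VF-1 lanes of the second, splicing the scalar that crosses the iteration boundary into place; for VF = 4 it is {3, 4, 5, 6}. A standalone sketch:

```cpp
#include <vector>

// First-order recurrence mask over the concatenation (first op, second op).
std::vector<unsigned> recurrenceMask(unsigned VF) {
  std::vector<unsigned> Mask;
  Mask.push_back(VF - 1);       // last lane of the first operand
  for (unsigned I = 1; I < VF; ++I)
    Mask.push_back(I + VF - 1); // lanes 0 .. VF-2 of the second operand
  return Mask;
}
```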
@@ -4061,15 +4083,16 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Shuffle the current and previous vector and update the vector parts.
for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
+ Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
auto *Shuffle =
- VF > 1
- ? Builder.CreateShuffleVector(Incoming, PreviousParts[Part],
- ConstantVector::get(ShuffleMask))
- : Incoming;
- PhiParts[Part]->replaceAllUsesWith(Shuffle);
- cast<Instruction>(PhiParts[Part])->eraseFromParent();
- PhiParts[Part] = Shuffle;
- Incoming = PreviousParts[Part];
+ VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
+ ConstantVector::get(ShuffleMask))
+ : Incoming;
+ PhiPart->replaceAllUsesWith(Shuffle);
+ cast<Instruction>(PhiPart)->eraseFromParent();
+ VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
+ Incoming = PreviousPart;
}
// Fix the latch value of the new recurrence in the vector loop.
@@ -4097,7 +4120,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// `Incoming`. This is analogous to the vectorized case above: extracting the
// second last element when VF > 1.
else if (UF > 1)
- ExtractForPhiUsedOutsideLoop = PreviousParts[UF - 2];
+ ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
@@ -4148,8 +4171,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
// This is the vector-clone of the value that leaves the loop.
- const VectorParts &VectorExit = getVectorValue(LoopExitInst);
- Type *VecTy = VectorExit[0]->getType();
+ Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
// Find the reduction identity variable. Zero for addition, or, xor,
// one for multiplication, -1 for And.
@@ -4187,18 +4209,17 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
- const VectorParts &VecRdxPhi = getVectorValue(Phi);
BasicBlock *Latch = OrigLoop->getLoopLatch();
Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
- const VectorParts &Val = getVectorValue(LoopVal);
- for (unsigned part = 0; part < UF; ++part) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
+ Value *Val = getOrCreateVectorValue(LoopVal, Part);
// Make sure to add the reduction start value only to the
// first unroll part.
- Value *StartVal = (part == 0) ? VectorStart : Identity;
- cast<PHINode>(VecRdxPhi[part])
- ->addIncoming(StartVal, LoopVectorPreHeader);
- cast<PHINode>(VecRdxPhi[part])
- ->addIncoming(Val[part], LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+ Value *StartVal = (Part == 0) ? VectorStart : Identity;
+ cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
+ cast<PHINode>(VecRdxPhi)
+ ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
}
// Before each round, move the insertion point right between
@@ -4207,7 +4228,6 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// instructions.
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst);
setDebugLocFromInst(Builder, LoopExitInst);
// If the vector reduction can be performed in a smaller type, we truncate
@@ -4216,37 +4236,42 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
Builder.SetInsertPoint(LoopVectorBody->getTerminator());
- for (unsigned part = 0; part < UF; ++part) {
- Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
+ VectorParts RdxParts(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
- : Builder.CreateZExt(Trunc, VecTy);
- for (Value::user_iterator UI = RdxParts[part]->user_begin();
- UI != RdxParts[part]->user_end();)
+ : Builder.CreateZExt(Trunc, VecTy);
+ for (Value::user_iterator UI = RdxParts[Part]->user_begin();
+ UI != RdxParts[Part]->user_end();)
if (*UI != Trunc) {
- (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
- RdxParts[part] = Extnd;
+ (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
+ RdxParts[Part] = Extnd;
} else {
++UI;
}
}
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- for (unsigned part = 0; part < UF; ++part)
- RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
+ }
}
// Reduce all of the unrolled parts into a single vector.
- Value *ReducedPartRdx = RdxParts[0];
+ Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
setDebugLocFromInst(Builder, ReducedPartRdx);
- for (unsigned part = 1; part < UF; ++part) {
+ for (unsigned Part = 1; Part < UF; ++Part) {
+ Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
// Floating point operations had to be 'fast' to enable the reduction.
ReducedPartRdx = addFastMathFlag(
- Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
+ Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
ReducedPartRdx, "bin.rdx"));
else
ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
- Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
+ Builder, MinMaxKind, ReducedPartRdx, RdxPart);
}
if (VF > 1) {
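The cross-part combine just above is a left fold of the UF part values with the recurrence's binary operation (min/max recurrences go through createMinMaxOp instead). A standalone sketch of the fold, assuming at least one part:

```cpp
#include <cstddef>
#include <functional>
#include <vector>

// Fold UF unrolled reduction parts into a single value.
int reduceParts(const std::vector<int> &RdxParts,
                const std::function<int(int, int)> &BinRdx) {
  int Reduced = RdxParts[0];
  for (std::size_t Part = 1; Part < RdxParts.size(); ++Part)
    Reduced = BinRdx(RdxParts[Part], Reduced); // operand order as above
  return Reduced;
}
```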
@@ -4518,14 +4543,16 @@ InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
assert(BI && "Unexpected terminator found");
if (BI->isConditional()) {
- VectorParts EdgeMask = getVectorValue(BI->getCondition());
- if (BI->getSuccessor(0) != Dst)
- for (unsigned part = 0; part < UF; ++part)
- EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
+ VectorParts EdgeMask(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
+ if (BI->getSuccessor(0) != Dst)
+ EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
- for (unsigned part = 0; part < UF; ++part)
- EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
+ EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
+ EdgeMask[Part] = EdgeMaskPart;
+ }
EdgeMaskCache[Edge] = EdgeMask;
return EdgeMask;
@@ -4544,23 +4571,27 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
if (BCEntryIt != BlockMaskCache.end())
return BCEntryIt->second;
+ VectorParts BlockMask(UF);
+
// Loop incoming mask is all-one.
if (OrigLoop->getHeader() == BB) {
Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
- const VectorParts &BlockMask = getVectorValue(C);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockMask[Part] = getOrCreateVectorValue(C, Part);
BlockMaskCache[BB] = BlockMask;
return BlockMask;
}
// This is the block mask. We OR all incoming edge masks, seeded with zero.
Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
- VectorParts BlockMask = getVectorValue(Zero);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockMask[Part] = getOrCreateVectorValue(Zero, Part);
// For each pred:
- for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
- VectorParts EM = createEdgeMask(*it, BB);
- for (unsigned part = 0; part < UF; ++part)
- BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
+ for (pred_iterator It = pred_begin(BB), E = pred_end(BB); It != E; ++It) {
+ VectorParts EM = createEdgeMask(*It, BB);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EM[Part]);
}
BlockMaskCache[BB] = BlockMask;
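In other words: an edge mask is the source block's mask ANDed with the branch condition (negated toward the false successor), and a block mask is the OR of its incoming edge masks, seeded with zero. A per-lane sketch with plain booleans; the real masks are vector Values, so this is illustrative only:

```cpp
#include <cstddef>
#include <vector>

using Mask = std::vector<bool>; // one predicate bit per vector lane

Mask edgeMask(const Mask &SrcMask, const Mask &Cond, bool TakenOnTrue) {
  Mask M(SrcMask.size());
  for (std::size_t I = 0; I < M.size(); ++I)
    M[I] = SrcMask[I] && (TakenOnTrue ? Cond[I] : !Cond[I]);
  return M;
}

Mask blockMask(const std::vector<Mask> &IncomingEdgeMasks, std::size_t Lanes) {
  Mask M(Lanes, false); // seed with zero, then OR each incoming edge
  for (const Mask &EM : IncomingEdgeMasks)
    for (std::size_t I = 0; I < Lanes; ++I)
      M[I] = M[I] || EM[I];
  return M;
}
```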
@@ -4575,15 +4606,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.
if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
- VectorParts Entry(UF);
- for (unsigned part = 0; part < UF; ++part) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
// This is phase one of vectorizing PHIs.
Type *VecTy =
(VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
- Entry[part] = PHINode::Create(
+ Value *EntryPart = PHINode::Create(
VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
+ VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
}
- VectorLoopValueMap.initVector(P, Entry);
return;
}
@@ -4607,21 +4637,22 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
for (unsigned In = 0; In < NumIncoming; In++) {
VectorParts Cond =
createEdgeMask(P->getIncomingBlock(In), P->getParent());
- const VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
- for (unsigned part = 0; part < UF; ++part) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part);
// We might have single edge PHIs (blocks) - use an identity
// 'select' for the first PHI operand.
if (In == 0)
- Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In0[part]);
+ Entry[Part] = Builder.CreateSelect(Cond[Part], In0, In0);
else
// Select between the current value and the previous incoming edge
// based on the incoming mask.
- Entry[part] = Builder.CreateSelect(Cond[part], In0[part], Entry[part],
+ Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part],
"predphi");
}
}
- VectorLoopValueMap.initVector(P, Entry);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]);
return;
}
@@ -4652,18 +4683,15 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
// These are the scalar results. Notice that we don't generate vector GEPs
// because scalar GEPs result in better code.
- ScalarParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part].resize(VF);
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
SclrGep->setName("next.gep");
- Entry[Part][Lane] = SclrGep;
+ VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep);
}
}
- VectorLoopValueMap.initScalar(P, Entry);
return;
}
}
@@ -4713,7 +4741,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
// is vector-typed. Thus, to keep the representation compact, we only use
// vector-typed operands for loop-varying values.
auto *GEP = cast<GetElementPtrInst>(&I);
- VectorParts Entry(UF);
if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
// If we are vectorizing, but the GEP has only loop-invariant operands,
@@ -4729,8 +4756,11 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
// collectLoopScalars() and teach getVectorValue() to broadcast
// the lane-zero scalar value.
auto *Clone = Builder.Insert(GEP->clone());
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = Builder.CreateVectorSplat(VF, Clone);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
+ VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
+ addMetadata(EntryPart, GEP);
+ }
} else {
// If the GEP has at least one loop-varying operand, we are sure to
// produce a vector of pointers. But if we are only unrolling, we want
@@ -4743,9 +4773,10 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
// The pointer operand of the new GEP. If it's loop-invariant, we
// won't broadcast it.
- auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand())
- ? GEP->getPointerOperand()
- : getVectorValue(GEP->getPointerOperand())[Part];
+ auto *Ptr =
+ OrigLoop->isLoopInvariant(GEP->getPointerOperand())
+ ? GEP->getPointerOperand()
+ : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
// Collect all the indices for the new GEP. If any index is
// loop-invariant, we won't broadcast it.
@@ -4754,7 +4785,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
if (OrigLoop->isLoopInvariant(U.get()))
Indices.push_back(U.get());
else
- Indices.push_back(getVectorValue(U.get())[Part]);
+ Indices.push_back(getOrCreateVectorValue(U.get(), Part));
}
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
@@ -4764,12 +4795,11 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
: Builder.CreateGEP(Ptr, Indices);
assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
"NewGEP is not a pointer vector");
- Entry[Part] = NewGEP;
+ VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
+ addMetadata(NewGEP, GEP);
}
}
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, GEP);
break;
}
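The case split performed above condenses to a small decision function; a hypothetical restatement with invented names, for illustration only:

```cpp
// Which flavor of GEP widening the logic above selects per unroll part.
enum class GEPWidening {
  SplatScalarClone, // VF > 1, all operands loop-invariant: clone + splat
  VectorGEP,        // VF > 1, some loop-varying operand: GEP over vectors
  ScalarGEP         // VF == 1 (unrolling only): keep scalar pointers
};

GEPWidening classifyGEP(bool AllOperandsLoopInvariant, unsigned VF) {
  if (VF > 1 && AllOperandsLoopInvariant)
    return GEPWidening::SplatScalarClone;
  return VF > 1 ? GEPWidening::VectorGEP : GEPWidening::ScalarGEP;
}
```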
case Instruction::UDiv:
@@ -4800,22 +4830,20 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
// Just widen binops.
auto *BinOp = cast<BinaryOperator>(&I);
setDebugLocFromInst(Builder, BinOp);
- const VectorParts &A = getVectorValue(BinOp->getOperand(0));
- const VectorParts &B = getVectorValue(BinOp->getOperand(1));
- // Use this vector value for all users of the original instruction.
- VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
+ Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
+ Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
VecOp->copyIRFlags(BinOp);
- Entry[Part] = V;
+ // Use this vector value for all users of the original instruction.
+ VectorLoopValueMap.setVectorValue(&I, Part, V);
+ addMetadata(V, BinOp);
}
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, BinOp);
break;
}
case Instruction::Select: {
@@ -4831,20 +4859,19 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
// loop. This means that we can't just use the original 'cond' value.
// We have to take the 'vectorized' value and pick the first lane.
// Instcombine will make this a no-op.
- const VectorParts &Cond = getVectorValue(I.getOperand(0));
- const VectorParts &Op0 = getVectorValue(I.getOperand(1));
- const VectorParts &Op1 = getVectorValue(I.getOperand(2));
- auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);
+ auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0);
- VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part] = Builder.CreateSelect(
- InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
+ Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
+ Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
+ Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
+ Value *Sel =
+ Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
+ VectorLoopValueMap.setVectorValue(&I, Part, Sel);
+ addMetadata(Sel, &I);
}
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
break;
}
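The select hunk above represents an invariant condition by its lane-zero scalar, which then steers every lane of every part. A toy model of that behavior with VF fixed at 4 and std types only (illustrative, not LLVM code):

#include <array>

// With an invariant condition a single bool drives all lanes; otherwise each
// lane consults its own element of the widened condition vector.
std::array<int, 4> widenSelect(bool InvariantCond, bool ScalarCond,
                               const std::array<bool, 4> &Cond,
                               const std::array<int, 4> &Op0,
                               const std::array<int, 4> &Op1) {
  std::array<int, 4> Sel{};
  for (int Lane = 0; Lane < 4; ++Lane)
    Sel[Lane] = (InvariantCond ? ScalarCond : Cond[Lane]) ? Op0[Lane]
                                                          : Op1[Lane];
  return Sel;
}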
@@ -4854,22 +4881,20 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
bool FCmp = (I.getOpcode() == Instruction::FCmp);
auto *Cmp = dyn_cast<CmpInst>(&I);
setDebugLocFromInst(Builder, Cmp);
- const VectorParts &A = getVectorValue(Cmp->getOperand(0));
- const VectorParts &B = getVectorValue(Cmp->getOperand(1));
- VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
+ Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
Value *C = nullptr;
if (FCmp) {
- C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
} else {
- C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
+ C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
}
- Entry[Part] = C;
+ VectorLoopValueMap.setVectorValue(&I, Part, C);
+ addMetadata(C, &I);
}
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
break;
}
@@ -4906,12 +4931,12 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
Type *DestTy =
(VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
- const VectorParts &A = getVectorValue(CI->getOperand(0));
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
+ Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+ VectorLoopValueMap.setVectorValue(&I, Part, Cast);
+ addMetadata(Cast, &I);
+ }
break;
}
@@ -4949,17 +4974,14 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
break;
}
- VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
Value *Arg = CI->getArgOperand(i);
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
- if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
- const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
- Arg = VectorArg[Part];
- }
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
+ Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
Args.push_back(Arg);
}
@@ -4992,11 +5014,10 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
if (isa<FPMathOperator>(V))
V->copyFastMathFlags(CI);
- Entry[Part] = V;
+ VectorLoopValueMap.setVectorValue(&I, Part, V);
+ addMetadata(V, &I);
}
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
break;
}
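hasVectorInstrinsicScalarOpd (the misspelling is the actual identifier) marks intrinsic operands that must stay scalar even inside a widened call; llvm.powi's i32 exponent is a typical example. A standalone sketch of the argument loop above, with toy string values and a flag vector standing in for the real query:

#include <string>
#include <vector>

// Operands flagged as inherently scalar pass through unchanged; all others
// are replaced by their per-part vector value (modeled here by a suffix).
std::vector<std::string>
buildWidenedArgs(const std::vector<std::string> &ScalarArgs,
                 const std::vector<bool> &IsScalarOpd, unsigned Part) {
  std::vector<std::string> Args;
  for (unsigned i = 0; i != ScalarArgs.size(); ++i)
    Args.push_back(IsScalarOpd[i]
                       ? ScalarArgs[i] // e.g. powi's exponent stays scalar
                       : ScalarArgs[i] + ".part" + std::to_string(Part));
  return Args;
}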
@@ -5363,7 +5384,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
- if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) {
+ if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
+ SinkAfter, DT)) {
FirstOrderRecurrences.insert(Phi);
continue;
}
@@ -7636,6 +7658,15 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
// 2. Copy and widen instructions from the old loop into the new loop.
+ // Move instructions to handle first-order recurrences.
+ DenseMap<Instruction *, Instruction *> SinkAfter = Legal->getSinkAfter();
+ for (auto &Entry : SinkAfter) {
+ Entry.first->removeFromParent();
+ Entry.first->insertAfter(Entry.second);
+ DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second
+ << " to vectorize a 1st order recurrence.\n");
+ }
+
// Collect instructions from the original loop that will become trivially dead
// in the vectorized loop. We don't need to vectorize these instructions. For
// example, original induction update instructions can become dead because we
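The SinkAfter loop added above reorders instructions so a first-order recurrence becomes vectorizable: the user of the previous iteration's value is moved to sit after the instruction that produces that value. An illustrative source-level shape of such a recurrence (an assumed example, not taken from the commit):

// a[i] consumes the b-value loaded by the *previous* iteration; sinking the
// user past the update is the IR-level analogue of swapping these two lines.
void firstOrderRecurrence(int *a, const int *b, int n) {
  int Prev = 0;
  for (int i = 0; i < n; ++i) {
    a[i] = Prev + 3; // user of last iteration's value
    Prev = b[i];     // recurrence update
  }
}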
@@ -7787,8 +7818,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- // Check the loop for a trip count threshold:
- // do not vectorize loops with a tiny trip count.
+ PredicatedScalarEvolution PSE(*SE, *L);
+
+ // Check if it is legal to vectorize the loop.
+ LoopVectorizationRequirements Requirements(*ORE);
+ LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
+ &Requirements, &Hints);
+ if (!LVL.canVectorize()) {
+ DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+ emitMissedWarning(F, L, Hints, ORE);
+ return false;
+ }
+
+ // Check the function attributes to find out if this function should be
+ // optimized for size.
+ bool OptForSize =
+ Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+
+ // Check the loop for a trip count threshold: vectorize loops with a tiny trip
+ // count by optimizing for size, to minimize overheads.
unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
bool HasExpectedTC = (ExpectedTC > 0);
@@ -7802,36 +7850,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
- << "This loop is not worth vectorizing.");
+ << "This loop is worth vectorizing only if no scalar "
+ << "iteration overheads are incurred.");
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
DEBUG(dbgs() << "\n");
- ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
- "NotBeneficial", L)
- << "vectorization is not beneficial "
- "and is not explicitly forced");
- return false;
+ // Loops with a very small trip count are considered for vectorization
+ // under OptForSize, thereby making sure the cost of their loop body is
+ // dominant, free of runtime guards and scalar iteration overheads.
+ OptForSize = true;
}
}
- PredicatedScalarEvolution PSE(*SE, *L);
-
- // Check if it is legal to vectorize the loop.
- LoopVectorizationRequirements Requirements(*ORE);
- LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
- &Requirements, &Hints);
- if (!LVL.canVectorize()) {
- DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
- emitMissedWarning(F, L, Hints, ORE);
- return false;
- }
-
- // Check the function attributes to find out if this function should be
- // optimized for size.
- bool OptForSize =
- Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
-
// Check the function attributes to see if implicit floats are allowed.
// FIXME: This check doesn't seem possibly correct -- what if the loop is
// an integer loop and the vector instructions selected are purely integer
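The reordering above hoists the legality check and the OptForSize computation so that a tiny known trip count can downgrade the loop to size-optimized vectorization instead of rejecting it outright. Illustrative only; the cutoff is the TinyTripCountVectorThreshold option, and the trip count below is merely assumed to fall under its default:

// A loop like this, with a small constant trip count, is now costed as if
// compiling for size: it is vectorized only when that requires no runtime
// guards and no scalar epilogue iterations.
void tinyTripCount(float *a, const float *b) {
  for (int i = 0; i < 8; ++i)
    a[i] += b[i];
}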
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b267230d3185..b494526369d6 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -173,6 +173,11 @@ static unsigned getAltOpcode(unsigned Op) {
}
}
+/// \returns true if the \p Value is odd, false otherwise.
+static bool isOdd(unsigned Value) {
+ return Value & 1;
+}
+
///\returns bool representing if Opcode \p Op can be part
/// of an alternate sequence which can later be merged as
/// a ShuffleVector instruction.
@@ -190,7 +195,7 @@ static unsigned isAltInst(ArrayRef<Value *> VL) {
unsigned AltOpcode = getAltOpcode(Opcode);
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
+ if (!I || I->getOpcode() != (isOdd(i) ? AltOpcode : Opcode))
return 0;
}
return Instruction::ShuffleVector;
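isAltInst accepts bundles whose even positions use Opcode and odd positions its alternate. A typical scalar pattern that yields such a bundle (an assumed example), later emitted as two vector ops blended by a ShuffleVector:

// Even lanes add, odd lanes subtract: the classic SLP "addsub" shape.
void addSub(int *r, const int *x, const int *y) {
  r[0] = x[0] + y[0];
  r[1] = x[1] - y[1];
  r[2] = x[2] + y[2];
  r[3] = x[3] - y[3];
}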
@@ -504,7 +509,7 @@ private:
Last->NeedToGather = !Vectorized;
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
- assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
+ assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
ScalarToTreeEntry[VL[i]] = idx;
}
} else {
@@ -521,6 +526,20 @@ private:
/// Holds all of the tree entries.
std::vector<TreeEntry> VectorizableTree;
+ TreeEntry *getTreeEntry(Value *V) {
+ auto I = ScalarToTreeEntry.find(V);
+ if (I != ScalarToTreeEntry.end())
+ return &VectorizableTree[I->second];
+ return nullptr;
+ }
+
+ const TreeEntry *getTreeEntry(Value *V) const {
+ auto I = ScalarToTreeEntry.find(V);
+ if (I != ScalarToTreeEntry.end())
+ return &VectorizableTree[I->second];
+ return nullptr;
+ }
+
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, int> ScalarToTreeEntry;
@@ -1048,14 +1067,14 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
for (TreeEntry &EIdx : VectorizableTree) {
TreeEntry *Entry = &EIdx;
+ // No need to handle users of gathered values.
+ if (Entry->NeedToGather)
+ continue;
+
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
- // No need to handle users of gathered values.
- if (Entry->NeedToGather)
- continue;
-
// Check if the scalar is externally used as an extra arg.
auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
@@ -1072,9 +1091,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
continue;
// Skip in-tree scalars that become vectors
- if (ScalarToTreeEntry.count(U)) {
- int Idx = ScalarToTreeEntry[U];
- TreeEntry *UseEntry = &VectorizableTree[Idx];
+ if (TreeEntry *UseEntry = getTreeEntry(U)) {
Value *UseScalar = UseEntry->Scalars[0];
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in Lane 0 will
@@ -1083,7 +1100,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
- assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
+ assert(!UseEntry->NeedToGather && "Bad state");
continue;
}
}
@@ -1156,9 +1173,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// Check if this is a duplicate of another entry.
- if (ScalarToTreeEntry.count(VL[0])) {
- int Idx = ScalarToTreeEntry[VL[0]];
- TreeEntry *E = &VectorizableTree[Idx];
+ if (TreeEntry *E = getTreeEntry(VL[0])) {
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
if (E->Scalars[i] != VL[i]) {
@@ -1997,7 +2012,7 @@ int BoUpSLP::getSpillCost() {
// Update LiveValues.
LiveValues.erase(PrevInst);
for (auto &J : PrevInst->operands()) {
- if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
+ if (isa<Instruction>(&*J) && getTreeEntry(&*J))
LiveValues.insert(cast<Instruction>(&*J));
}
@@ -2393,9 +2408,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
CSEBlocks.insert(Insrt->getParent());
// Add to our 'need-to-extract' list.
- if (ScalarToTreeEntry.count(VL[i])) {
- int Idx = ScalarToTreeEntry[VL[i]];
- TreeEntry *E = &VectorizableTree[Idx];
+ if (TreeEntry *E = getTreeEntry(VL[i])) {
// Find which lane we need to extract.
int FoundLane = -1;
for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
@@ -2415,11 +2428,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
}
Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
- SmallDenseMap<Value*, int>::const_iterator Entry
- = ScalarToTreeEntry.find(VL[0]);
- if (Entry != ScalarToTreeEntry.end()) {
- int Idx = Entry->second;
- const TreeEntry *En = &VectorizableTree[Idx];
+ if (const TreeEntry *En = getTreeEntry(VL[0])) {
if (En->isSame(VL) && En->VectorizedValue)
return En->VectorizedValue;
}
@@ -2427,12 +2436,9 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
}
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
- if (ScalarToTreeEntry.count(VL[0])) {
- int Idx = ScalarToTreeEntry[VL[0]];
- TreeEntry *E = &VectorizableTree[Idx];
+ if (TreeEntry *E = getTreeEntry(VL[0]))
if (E->isSame(VL))
return vectorizeTree(E);
- }
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -2667,9 +2673,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The pointer operand uses an in-tree scalar so we add the new BitCast to
// ExternalUses list to make sure that an extract will be generated in the
// future.
- if (ScalarToTreeEntry.count(LI->getPointerOperand()))
- ExternalUses.push_back(
- ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
+ Value *PO = LI->getPointerOperand();
+ if (getTreeEntry(PO))
+ ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
@@ -2700,9 +2706,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The pointer operand uses an in-tree scalar so we add the new BitCast to
// ExternalUses list to make sure that an extract will be generated in the
// future.
- if (ScalarToTreeEntry.count(SI->getPointerOperand()))
- ExternalUses.push_back(
- ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
+ Value *PO = SI->getPointerOperand();
+ if (getTreeEntry(PO))
+ ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
if (!Alignment) {
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
@@ -2783,7 +2789,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The scalar argument uses an in-tree scalar so we add the new vectorized
// call to ExternalUses list to make sure that an extract will be
// generated in the future.
- if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
+ if (ScalarArg && getTreeEntry(ScalarArg))
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
E->VectorizedValue = V;
@@ -2819,7 +2825,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned e = E->Scalars.size();
SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
- if (i & 1) {
+ if (isOdd(i)) {
Mask[i] = Builder.getInt32(e + i);
OddScalars.push_back(E->Scalars[i]);
} else {
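The mask construction above routes odd lanes to the second shuffle input (indices >= e) and, in the elided else-branch, even lanes to the first. A standalone helper computing the same mask, assuming that else-branch sets Mask[i] = i; for e = 4 this yields <0, 5, 2, 7>:

#include <vector>

// Even lanes select from the Opcode vector (indices 0..e-1), odd lanes from
// the AltOpcode vector (indices e..2e-1), per shufflevector semantics.
std::vector<unsigned> buildAltShuffleMask(unsigned e) {
  std::vector<unsigned> Mask(e);
  for (unsigned i = 0; i < e; ++i)
    Mask[i] = (i & 1) ? e + i : i;
  return Mask;
}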
@@ -2897,10 +2903,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// has multiple uses of the same value.
if (User && !is_contained(Scalar->users(), User))
continue;
- assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
-
- int Idx = ScalarToTreeEntry[Scalar];
- TreeEntry *E = &VectorizableTree[Idx];
+ TreeEntry *E = getTreeEntry(Scalar);
+ assert(E && "Invalid scalar");
assert(!E->NeedToGather && "Extracting from a gather list");
Value *Vec = E->VectorizedValue;
@@ -2986,7 +2990,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
- assert((ScalarToTreeEntry.count(U) ||
+ assert((getTreeEntry(U) ||
// It is legal to replace users in the ignorelist by undef.
is_contained(UserIgnoreList, U)) &&
"Replacing out-of-tree value with undef");
@@ -3449,7 +3453,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
I = I->getNextNode()) {
ScheduleData *SD = BS->getScheduleData(I);
assert(
- SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
+ SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) &&
"scheduler and vectorizer have different opinion on what is a bundle");
SD->FirstInBundle->SchedulingPriority = Idx++;
if (SD->isSchedulingEntity()) {
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index a21928317888..fb2f509dcbaa 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -26,7 +26,6 @@ using namespace llvm;
/// initializeVectorizationPasses - Initialize all passes linked into the
/// Vectorization library.
void llvm::initializeVectorization(PassRegistry &Registry) {
- initializeBBVectorizePass(Registry);
initializeLoopVectorizePass(Registry);
initializeSLPVectorizerPass(Registry);
initializeLoadStoreVectorizerPass(Registry);
@@ -36,8 +35,8 @@ void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
initializeVectorization(*unwrap(R));
}
+// DEPRECATED: Remove after the LLVM 5 release.
void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createBBVectorizePass());
}
void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {