author | Dimitry Andric <dim@FreeBSD.org> | 2017-01-14 15:37:50 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-01-14 15:37:50 +0000 |
commit | 581a6d8501ff5614297da837b81ed3b6956361ea (patch) | |
tree | 985ee91d0ca1d3e6506ac5ff7e37f5b67adfec09 | |
parent | 909545a822eef491158f831688066f0ec2866938 (diff) | |
download | src-test2-581a6d8501ff5614297da837b81ed3b6956361ea.tar.gz, src-test2-581a6d8501ff5614297da837b81ed3b6956361ea.zip | |
485 files changed, 41265 insertions, 8554 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 64b33f277572..24323e0a4920 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,7 +29,7 @@ if(NOT DEFINED LLVM_VERSION_PATCH) set(LLVM_VERSION_PATCH 0) endif() if(NOT DEFINED LLVM_VERSION_SUFFIX) - set(LLVM_VERSION_SUFFIX svn) + set(LLVM_VERSION_SUFFIX "") endif() if (POLICY CMP0048) diff --git a/LICENSE.TXT b/LICENSE.TXT index 555c8bb952fc..ff63f2b6aae3 100644 --- a/LICENSE.TXT +++ b/LICENSE.TXT @@ -4,7 +4,7 @@ LLVM Release License University of Illinois/NCSA Open Source License -Copyright (c) 2003-2016 University of Illinois at Urbana-Champaign. +Copyright (c) 2003-2017 University of Illinois at Urbana-Champaign. All rights reserved. Developed by: diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index 56ba1479d7ee..fbe790b05b1a 100755 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -1007,6 +1007,7 @@ function(add_unittest test_suite test_name) endif() include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include) + include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googlemock/include) if (NOT LLVM_ENABLE_THREADS) list(APPEND LLVM_COMPILE_DEFINITIONS GTEST_HAS_PTHREAD=0) endif () diff --git a/cmake/modules/CheckLinkerFlag.cmake b/cmake/modules/CheckLinkerFlag.cmake new file mode 100644 index 000000000000..e96d35e7721e --- /dev/null +++ b/cmake/modules/CheckLinkerFlag.cmake @@ -0,0 +1,8 @@ +include(CheckCXXCompilerFlag) + +function(check_linker_flag flag out_var) + set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${flag}") + check_cxx_compiler_flag("" ${out_var}) + set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) +endfunction() diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 89d90befc816..4ce7f57403c4 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -597,6 +597,14 @@ if (UNIX AND append("-fcolor-diagnostics" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() +# lld doesn't print colored diagnostics when invoked from Ninja +if (UNIX AND CMAKE_GENERATOR STREQUAL "Ninja") + include(CheckLinkerFlag) + check_linker_flag("-Wl,-color-diagnostics" LINKER_SUPPORTS_COLOR_DIAGNOSTICS) + append_if(LINKER_SUPPORTS_COLOR_DIAGNOSTICS "-Wl,-color-diagnostics" + CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) +endif() + # Add flags for add_dead_strip(). # FIXME: With MSVS, consider compiling with /Gy and linking with /OPT:REF? # But MinSizeRel seems to add that automatically, so maybe disable these diff --git a/docs/LangRef.rst b/docs/LangRef.rst index ecf37bab55d0..5ac17015953e 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -2169,8 +2169,9 @@ Fast-Math Flags LLVM IR floating-point binary ops (:ref:`fadd <i_fadd>`, :ref:`fsub <i_fsub>`, :ref:`fmul <i_fmul>`, :ref:`fdiv <i_fdiv>`, -:ref:`frem <i_frem>`, :ref:`fcmp <i_fcmp>`) have the following flags that can -be set to enable otherwise unsafe floating point operations +:ref:`frem <i_frem>`, :ref:`fcmp <i_fcmp>`) and :ref:`call <i_call>` +instructions have the following flags that can be set to enable +otherwise unsafe floating point transformations. 
``nnan`` No NaNs - Allow optimizations to assume the arguments and result are not diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index 81db88289153..b92527dbb296 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -26,11 +26,6 @@ have questions or comments, the `LLVM Developer's Mailing List <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ is a good place to send them. -Note that if you are reading this file from a Subversion checkout or the main -LLVM web page, this document applies to the *next* release, not the current -one. To see the release notes for a specific release, please see the `releases -page <http://llvm.org/releases/>`_. - Non-comprehensive list of changes in this release ================================================= * The C API functions LLVMAddFunctionAttr, LLVMGetFunctionAttr, @@ -57,6 +52,9 @@ Non-comprehensive list of changes in this release the previously used names should become descriptions and a short name in the style of a programming language identifier should be added. +* LLVM now handles invariant.group across different basic blocks, which makes + it possible to devirtualize virtual calls inside loops. + * ... next change ... .. NOTE diff --git a/include/llvm/ADT/PointerSumType.h b/include/llvm/ADT/PointerSumType.h index 005b1c645c93..062544eedf84 100644 --- a/include/llvm/ADT/PointerSumType.h +++ b/include/llvm/ADT/PointerSumType.h @@ -94,7 +94,7 @@ public: return HelperT::template Lookup<N>::TraitsT::getFromVoidPointer(getImpl()); } - operator bool() const { return Value & HelperT::PointerMask; } + explicit operator bool() const { return Value & HelperT::PointerMask; } bool operator==(const PointerSumType &R) const { return Value == R.Value; } bool operator!=(const PointerSumType &R) const { return Value != R.Value; } bool operator<(const PointerSumType &R) const { return Value < R.Value; } diff --git a/include/llvm/ADT/iterator.h b/include/llvm/ADT/iterator.h index 9ccacc10db0d..6470e09db86c 100644 --- a/include/llvm/ADT/iterator.h +++ b/include/llvm/ADT/iterator.h @@ -33,6 +33,32 @@ namespace llvm { /// Another abstraction that this doesn't provide is implementing increment in /// terms of addition of one. These aren't equivalent for all iterator /// categories, and respecting that adds a lot of complexity for little gain. 
+/// +/// Classes wishing to use `iterator_facade_base` should implement the following +/// methods: +/// +/// Forward Iterators: +/// (All of the following methods) +/// - DerivedT &operator=(const DerivedT &R); +/// - bool operator==(const DerivedT &R) const; +/// - const T &operator*() const; +/// - T &operator*(); +/// - DerivedT &operator++(); +/// +/// Bidirectional Iterators: +/// (All methods of forward iterators, plus the following) +/// - DerivedT &operator--(); +/// +/// Random-access Iterators: +/// (All methods of bidirectional iterators excluding the following) +/// - DerivedT &operator++(); +/// - DerivedT &operator--(); +/// (and plus the following) +/// - bool operator<(const DerivedT &RHS) const; +/// - DifferenceTypeT operator-(const DerivedT &R) const; +/// - DerivedT &operator+=(DifferenceTypeT N); +/// - DerivedT &operator-=(DifferenceTypeT N); +/// template <typename DerivedT, typename IteratorCategoryT, typename T, typename DifferenceTypeT = std::ptrdiff_t, typename PointerT = T *, typename ReferenceT = T &> diff --git a/include/llvm/Analysis/AssumptionCache.h b/include/llvm/Analysis/AssumptionCache.h index 406a1fe9f560..b50545a0484b 100644 --- a/include/llvm/Analysis/AssumptionCache.h +++ b/include/llvm/Analysis/AssumptionCache.h @@ -46,6 +46,30 @@ class AssumptionCache { /// intrinsic. SmallVector<WeakVH, 4> AssumeHandles; + class AffectedValueCallbackVH final : public CallbackVH { + AssumptionCache *AC; + void deleted() override; + void allUsesReplacedWith(Value *) override; + + public: + using DMI = DenseMapInfo<Value *>; + + AffectedValueCallbackVH(Value *V, AssumptionCache *AC = nullptr) + : CallbackVH(V), AC(AC) {} + }; + + friend AffectedValueCallbackVH; + + /// \brief A map of values about which an assumption might be providing + /// information to the relevant set of assumptions. + using AffectedValuesMap = + DenseMap<AffectedValueCallbackVH, SmallVector<WeakVH, 1>, + AffectedValueCallbackVH::DMI>; + AffectedValuesMap AffectedValues; + + /// Get the vector of assumptions which affect a value from the cache. + SmallVector<WeakVH, 1> &getAffectedValues(Value *V); + /// \brief Flag tracking whether we have scanned the function yet. /// /// We want to be as lazy about this as possible, and so we scan the function @@ -66,11 +90,16 @@ public: /// not already be in the cache. void registerAssumption(CallInst *CI); + /// \brief Update the cache of values being affected by this assumption (i.e. + /// the values about which this assumption provides information). + void updateAffectedValues(CallInst *CI); + /// \brief Clear the cache of @llvm.assume intrinsics for a function. /// /// It will be re-scanned the next time it is requested. void clear() { AssumeHandles.clear(); + AffectedValues.clear(); Scanned = false; } @@ -87,6 +116,18 @@ public: scanFunction(); return AssumeHandles; } + + /// \brief Access the list of assumptions which affect this value. + MutableArrayRef<WeakVH> assumptionsFor(const Value *V) { + if (!Scanned) + scanFunction(); + + auto AVI = AffectedValues.find_as(const_cast<Value *>(V)); + if (AVI == AffectedValues.end()) + return MutableArrayRef<WeakVH>(); + + return AVI->second; + } }; /// \brief A function analysis which provides an \c AssumptionCache. 
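The new header comment in the `ADT/iterator.h` hunk above spells out exactly which members a class derived from `iterator_facade_base` must provide. As a minimal sketch (the class and names here are hypothetical, not part of this commit), a forward iterator needs little more than equality, dereference, and pre-increment; the facade derives `operator!=`, `operator->`, and the post-increment form:

```c++
// Minimal sketch of the iterator_facade_base contract documented above.
// IntBufferIterator is a hypothetical example, not part of this commit.
#include "llvm/ADT/iterator.h"
#include <iterator>

class IntBufferIterator
    : public llvm::iterator_facade_base<IntBufferIterator,
                                        std::forward_iterator_tag, int> {
  int *Ptr = nullptr;

public:
  IntBufferIterator() = default;
  explicit IntBufferIterator(int *P) : Ptr(P) {}

  // The members a forward iterator must supply; copy-assignment is the
  // implicitly defaulted one. A single const dereference overload returning
  // a non-const reference suffices for this pointer-based sketch.
  bool operator==(const IntBufferIterator &R) const { return Ptr == R.Ptr; }
  int &operator*() const { return *Ptr; }
  IntBufferIterator &operator++() {
    ++Ptr;
    return *this;
  }
};
```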
diff --git a/include/llvm/Analysis/IVUsers.h b/include/llvm/Analysis/IVUsers.h index e1a5467d8b63..bb572dd5603b 100644 --- a/include/llvm/Analysis/IVUsers.h +++ b/include/llvm/Analysis/IVUsers.h @@ -15,8 +15,8 @@ #ifndef LLVM_ANALYSIS_IVUSERS_H #define LLVM_ANALYSIS_IVUSERS_H +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" #include "llvm/IR/ValueHandle.h" @@ -193,17 +193,10 @@ class IVUsersAnalysis : public AnalysisInfoMixin<IVUsersAnalysis> { public: typedef IVUsers Result; - IVUsers run(Loop &L, LoopAnalysisManager &AM); + IVUsers run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR); }; -/// Printer pass for the \c IVUsers for a loop. -class IVUsersPrinterPass : public PassInfoMixin<IVUsersPrinterPass> { - raw_ostream &OS; - -public: - explicit IVUsersPrinterPass(raw_ostream &OS) : OS(OS) {} - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM); -}; } #endif diff --git a/include/llvm/Analysis/LazyCallGraph.h b/include/llvm/Analysis/LazyCallGraph.h index 566e526f89b3..bca0aebe2eef 100644 --- a/include/llvm/Analysis/LazyCallGraph.h +++ b/include/llvm/Analysis/LazyCallGraph.h @@ -148,7 +148,7 @@ public: /// /// This happens when an edge has been deleted. We leave the edge objects /// around but clear them. - operator bool() const; + explicit operator bool() const; /// Returnss the \c Kind of the edge. Kind getKind() const; diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h index 76066f6003e7..901b193c7e2d 100644 --- a/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/include/llvm/Analysis/LoopAccessAnalysis.h @@ -20,7 +20,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/LoopPassManager.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/ValueHandle.h" @@ -753,18 +753,8 @@ class LoopAccessAnalysis public: typedef LoopAccessInfo Result; - Result run(Loop &, LoopAnalysisManager &); - static StringRef name() { return "LoopAccessAnalysis"; } -}; - -/// \brief Printer pass for the \c LoopAccessInfo results. -class LoopAccessInfoPrinterPass - : public PassInfoMixin<LoopAccessInfoPrinterPass> { - raw_ostream &OS; -public: - explicit LoopAccessInfoPrinterPass(raw_ostream &OS) : OS(OS) {} - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM); + Result run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR); }; inline Instruction *MemoryDepChecker::Dependence::getSource( diff --git a/include/llvm/Analysis/LoopAnalysisManager.h b/include/llvm/Analysis/LoopAnalysisManager.h new file mode 100644 index 000000000000..17da516889b0 --- /dev/null +++ b/include/llvm/Analysis/LoopAnalysisManager.h @@ -0,0 +1,155 @@ +//===- LoopAnalysisManager.h - Loop analysis management ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This header provides classes for managing per-loop analyses. These are +/// typically used as part of a loop pass pipeline over the loop nests of +/// a function. 
+/// +/// Loop analyses are allowed to make some simplifying assumptions: +/// 1) Loops are, where possible, in simplified form. +/// 2) Loops are *always* in LCSSA form. +/// 3) A collection of analysis results are available: +/// - LoopInfo +/// - DominatorTree +/// - ScalarEvolution +/// - AAManager +/// +/// The primary mechanism to provide these invariants is the loop pass manager, +/// but they can also be manually provided in order to reason about a loop from +/// outside of a dedicated pass manager. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_LOOPANALYSISMANAGER_H +#define LLVM_ANALYSIS_LOOPANALYSISMANAGER_H + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/PriorityWorklist.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +/// The adaptor from a function pass to a loop pass computes these analyses and +/// makes them available to the loop passes "for free". Each loop pass is +/// expected to update these analyses if necessary to ensure they're +/// valid after it runs. +struct LoopStandardAnalysisResults { + AAResults &AA; + AssumptionCache &AC; + DominatorTree &DT; + LoopInfo &LI; + ScalarEvolution &SE; + TargetLibraryInfo &TLI; + TargetTransformInfo &TTI; +}; + +/// Extern template declaration for the analysis set for this IR unit. +extern template class AllAnalysesOn<Loop>; + +extern template class AnalysisManager<Loop, LoopStandardAnalysisResults &>; +/// \brief The loop analysis manager. +/// +/// See the documentation for the AnalysisManager template for detailed +/// documentation. This typedef serves as a convenient way to refer to this +/// construct in the adaptors and proxies used to integrate this into the larger +/// pass manager infrastructure. +typedef AnalysisManager<Loop, LoopStandardAnalysisResults &> + LoopAnalysisManager; + +/// A proxy from a \c LoopAnalysisManager to a \c Function. +typedef InnerAnalysisManagerProxy<LoopAnalysisManager, Function> + LoopAnalysisManagerFunctionProxy; + +/// A specialized result for the \c LoopAnalysisManagerFunctionProxy which +/// retains a \c LoopInfo reference. +/// +/// This allows it to collect loop objects for which analysis results may be +/// cached in the \c LoopAnalysisManager. +template <> class LoopAnalysisManagerFunctionProxy::Result { +public: + explicit Result(LoopAnalysisManager &InnerAM, LoopInfo &LI) + : InnerAM(&InnerAM), LI(&LI) {} + Result(Result &&Arg) : InnerAM(std::move(Arg.InnerAM)), LI(Arg.LI) { + // We have to null out the analysis manager in the moved-from state + // because we are taking ownership of the responsibility to clear the + // analysis state. + Arg.InnerAM = nullptr; + } + Result &operator=(Result &&RHS) { + InnerAM = RHS.InnerAM; + LI = RHS.LI; + // We have to null out the analysis manager in the moved-from state + // because we are taking ownership of the responsibility to clear the + // analysis state. + RHS.InnerAM = nullptr; + return *this; + } + ~Result() { + // InnerAM is cleared in a moved-from state where there is nothing to do.
+ if (!InnerAM) + return; + + // Clear out the analysis manager if we're being destroyed -- it means we + // didn't even see an invalidate call when we got invalidated. + InnerAM->clear(); + } + + /// Accessor for the analysis manager. + LoopAnalysisManager &getManager() { return *InnerAM; } + + /// Handler for invalidation of the proxy for a particular function. + /// + /// If the proxy, \c LoopInfo, and associated analyses are preserved, this + /// will merely forward the invalidation event to any cached loop analysis + /// results for loops within this function. + /// + /// If the necessary loop infrastructure is not preserved, this will forcibly + /// clear all of the cached analysis results that are keyed on the \c + /// LoopInfo for this function. + bool invalidate(Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &Inv); + +private: + LoopAnalysisManager *InnerAM; + LoopInfo *LI; +}; + +/// Provide a specialized run method for the \c LoopAnalysisManagerFunctionProxy +/// so it can pass the \c LoopInfo to the result. +template <> +LoopAnalysisManagerFunctionProxy::Result +LoopAnalysisManagerFunctionProxy::run(Function &F, FunctionAnalysisManager &AM); + +// Ensure the \c LoopAnalysisManagerFunctionProxy is provided as an extern +// template. +extern template class InnerAnalysisManagerProxy<LoopAnalysisManager, Function>; + +extern template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop, + LoopStandardAnalysisResults &>; +/// A proxy from a \c FunctionAnalysisManager to a \c Loop. +typedef OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop, + LoopStandardAnalysisResults &> + FunctionAnalysisManagerLoopProxy; + +/// Returns the minimum set of Analyses that all loop passes must preserve. +PreservedAnalyses getLoopPassPreservedAnalyses(); +} + +#endif // LLVM_ANALYSIS_LOOPANALYSISMANAGER_H diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index 0c99c6297c1e..20e6af2727fe 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -853,17 +853,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override; }; -/// \brief Pass for printing a loop's contents as LLVM's text IR assembly. -class PrintLoopPass : public PassInfoMixin<PrintLoopPass> { - raw_ostream &OS; - std::string Banner; - -public: - PrintLoopPass(); - PrintLoopPass(raw_ostream &OS, const std::string &Banner = ""); - - PreservedAnalyses run(Loop &L, AnalysisManager<Loop> &); -}; +/// Function to print a loop's contents as LLVM's text IR assembly. +void printLoop(Loop &L, raw_ostream &OS, const std::string &Banner = ""); } // End llvm namespace diff --git a/include/llvm/Analysis/LoopPassManager.h b/include/llvm/Analysis/LoopPassManager.h deleted file mode 100644 index ae9c16502feb..000000000000 --- a/include/llvm/Analysis/LoopPassManager.h +++ /dev/null @@ -1,149 +0,0 @@ -//===- LoopPassManager.h - Loop pass management -----------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// -/// This header provides classes for managing passes over loops in LLVM IR. 
-/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ANALYSIS_LOOPPASSMANAGER_H -#define LLVM_ANALYSIS_LOOPPASSMANAGER_H - -#include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/PassManager.h" - -namespace llvm { - -extern template class PassManager<Loop>; -/// \brief The loop pass manager. -/// -/// See the documentation for the PassManager template for details. It runs a -/// sequency of loop passes over each loop that the manager is run over. This -/// typedef serves as a convenient way to refer to this construct. -typedef PassManager<Loop> LoopPassManager; - -extern template class AnalysisManager<Loop>; -/// \brief The loop analysis manager. -/// -/// See the documentation for the AnalysisManager template for detail -/// documentation. This typedef serves as a convenient way to refer to this -/// construct in the adaptors and proxies used to integrate this into the larger -/// pass manager infrastructure. -typedef AnalysisManager<Loop> LoopAnalysisManager; - -/// A proxy from a \c LoopAnalysisManager to a \c Function. -typedef InnerAnalysisManagerProxy<LoopAnalysisManager, Function> - LoopAnalysisManagerFunctionProxy; - -/// Specialization of the invalidate method for the \c -/// LoopAnalysisManagerFunctionProxy's result. -template <> -bool LoopAnalysisManagerFunctionProxy::Result::invalidate( - Function &F, const PreservedAnalyses &PA, - FunctionAnalysisManager::Invalidator &Inv); - -// Ensure the \c LoopAnalysisManagerFunctionProxy is provided as an extern -// template. -extern template class InnerAnalysisManagerProxy<LoopAnalysisManager, Function>; - -extern template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop>; -/// A proxy from a \c FunctionAnalysisManager to a \c Loop. -typedef OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop> - FunctionAnalysisManagerLoopProxy; - -/// Returns the minimum set of Analyses that all loop passes must preserve. -PreservedAnalyses getLoopPassPreservedAnalyses(); - -/// \brief Adaptor that maps from a function to its loops. -/// -/// Designed to allow composition of a LoopPass(Manager) and a -/// FunctionPassManager. Note that if this pass is constructed with a \c -/// FunctionAnalysisManager it will run the \c LoopAnalysisManagerFunctionProxy -/// analysis prior to running the loop passes over the function to enable a \c -/// LoopAnalysisManager to be used within this run safely. -template <typename LoopPassT> -class FunctionToLoopPassAdaptor - : public PassInfoMixin<FunctionToLoopPassAdaptor<LoopPassT>> { -public: - explicit FunctionToLoopPassAdaptor(LoopPassT Pass) - : Pass(std::move(Pass)) {} - - /// \brief Runs the loop passes across every loop in the function. - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) { - // Setup the loop analysis manager from its proxy. - LoopAnalysisManager &LAM = - AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); - // Get the loop structure for this function - LoopInfo &LI = AM.getResult<LoopAnalysis>(F); - - // Also precompute all of the function analyses used by loop passes. - // FIXME: These should be handed into the loop passes when the loop pass - // management layer is reworked to follow the design of CGSCC. 
- (void)AM.getResult<AAManager>(F); - (void)AM.getResult<DominatorTreeAnalysis>(F); - (void)AM.getResult<ScalarEvolutionAnalysis>(F); - (void)AM.getResult<TargetLibraryAnalysis>(F); - - PreservedAnalyses PA = PreservedAnalyses::all(); - - // We want to visit the loops in reverse post-order. We'll build the stack - // of loops to visit in Loops by first walking the loops in pre-order. - SmallVector<Loop *, 2> Loops; - SmallVector<Loop *, 2> WorkList(LI.begin(), LI.end()); - while (!WorkList.empty()) { - Loop *L = WorkList.pop_back_val(); - WorkList.insert(WorkList.end(), L->begin(), L->end()); - Loops.push_back(L); - } - - // Now pop each element off of the stack to visit the loops in reverse - // post-order. - for (auto *L : reverse(Loops)) { - PreservedAnalyses PassPA = Pass.run(*L, LAM); - // FIXME: We should verify the set of analyses relevant to Loop passes - // are preserved. - - // We know that the loop pass couldn't have invalidated any other loop's - // analyses (that's the contract of a loop pass), so directly handle the - // loop analysis manager's invalidation here. - LAM.invalidate(*L, PassPA); - - // Then intersect the preserved set so that invalidation of module - // analyses will eventually occur when the module pass completes. - PA.intersect(std::move(PassPA)); - } - - // By definition we preserve the proxy. We also preserve all analyses on - // Loops. This precludes *any* invalidation of loop analyses by the proxy, - // but that's OK because we've taken care to invalidate analyses in the - // loop analysis manager incrementally above. - PA.preserveSet<AllAnalysesOn<Loop>>(); - PA.preserve<LoopAnalysisManagerFunctionProxy>(); - return PA; - } - -private: - LoopPassT Pass; -}; - -/// \brief A function to deduce a loop pass type and wrap it in the templated -/// adaptor. -template <typename LoopPassT> -FunctionToLoopPassAdaptor<LoopPassT> -createFunctionToLoopPassAdaptor(LoopPassT Pass) { - return FunctionToLoopPassAdaptor<LoopPassT>(std::move(Pass)); -} -} - -#endif // LLVM_ANALYSIS_LOOPPASSMANAGER_H diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h index 33dbd22f7a20..a401887016c9 100644 --- a/include/llvm/Analysis/MemoryDependenceAnalysis.h +++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h @@ -302,6 +302,10 @@ private: NonLocalPointerInfo() : Size(MemoryLocation::UnknownSize) {} }; + /// Cache storing a single nonlocal def for the instruction. + /// It is set when a nonlocal def would be found in a function returning only + /// local dependencies. DenseMap<Instruction *, NonLocalDepResult> NonLocalDefsCache; /// This map stores the cached results of doing a pointer lookup at the /// bottom of a block. /// @@ -441,9 +445,9 @@ public: /// This analysis looks for other loads and stores with invariant.group /// metadata and the same pointer operand. Returns Unknown if it does not /// find anything, and Def if it can be assumed that 2 instructions load or - /// store the same value. - /// FIXME: This analysis works only on single block because of restrictions - /// at the call site. + /// store the same value, and NonLocal, which indicates that a non-local Def + /// was found, which can be retrieved by calling getNonLocalPointerDependency + /// with the same queried instruction.
MemDepResult getInvariantGroupPointerDependency(LoadInst *LI, BasicBlock *BB); /// Looks at a memory location for a load (specified by MemLocBase, Offs, and diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index b4a6c5c2fae0..209f05c279d0 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -55,6 +55,11 @@ struct MemIntrinsicInfo { // Same Id is set by the target for corresponding load/store intrinsics. unsigned short MatchingId; int NumMemRefs; + + /// This is the pointer that the intrinsic is loading from or storing to. + /// If this is non-null, then analysis/optimization passes can assume that + /// this intrinsic is functionally equivalent to a load/store from this + /// pointer. Value *PtrVal; }; @@ -518,11 +523,15 @@ public: unsigned getMaxInterleaveFactor(unsigned VF) const; /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc. + /// \p Args is an optional argument which holds the instruction operands' + /// values so the TTI can analyze those values searching for special + /// cases/optimizations based on those values. int getArithmeticInstrCost( unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue, OperandValueKind Opd2Info = OK_AnyValue, OperandValueProperties Opd1PropInfo = OP_None, - OperandValueProperties Opd2PropInfo = OP_None) const; + OperandValueProperties Opd2PropInfo = OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()) const; /// \return The cost of a shuffle instruction of kind Kind and of type Tp. /// The index and subtype parameters are used by the subvector insertion and @@ -763,7 +772,8 @@ public: getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) = 0; + OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) = 0; virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) = 0; virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) = 0; @@ -984,9 +994,10 @@ public: getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) override { + OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) override { return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args); } int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) override { diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 1d7edbaf7df0..cafc40723c9d 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -306,7 +306,8 @@ public: TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) { return 1; } @@ -427,6 +428,63 @@ public: return VF; } protected: + // Obtain the minimum required size to hold the value (without the sign). + // In case of a vector it returns the min required size for one element.
+ unsigned minRequiredElementSize(const Value* Val, bool &isSigned) { + if (isa<ConstantDataVector>(Val) || isa<ConstantVector>(Val)) { + const auto* VectorValue = cast<Constant>(Val); + + // In case of a vector need to pick the max between the min + // required size for each element + auto *VT = cast<VectorType>(Val->getType()); + + // Assume unsigned elements + isSigned = false; + + // The max required size is the total vector width divided by num + // of elements in the vector + unsigned MaxRequiredSize = VT->getBitWidth() / VT->getNumElements(); + + unsigned MinRequiredSize = 0; + for(unsigned i = 0, e = VT->getNumElements(); i < e; ++i) { + if (auto* IntElement = + dyn_cast<ConstantInt>(VectorValue->getAggregateElement(i))) { + bool signedElement = IntElement->getValue().isNegative(); + // Get the element min required size. + unsigned ElementMinRequiredSize = + IntElement->getValue().getMinSignedBits() - 1; + // In case one element is signed then all the vector is signed. + isSigned |= signedElement; + // Save the max required bit size between all the elements. + MinRequiredSize = std::max(MinRequiredSize, ElementMinRequiredSize); + } + else { + // not an int constant element + return MaxRequiredSize; + } + } + return MinRequiredSize; + } + + if (const auto* CI = dyn_cast<ConstantInt>(Val)) { + isSigned = CI->getValue().isNegative(); + return CI->getValue().getMinSignedBits() - 1; + } + + if (const auto* Cast = dyn_cast<SExtInst>(Val)) { + isSigned = true; + return Cast->getSrcTy()->getScalarSizeInBits() - 1; + } + + if (const auto* Cast = dyn_cast<ZExtInst>(Val)) { + isSigned = false; + return Cast->getSrcTy()->getScalarSizeInBits(); + } + + isSigned = false; + return Val->getType()->getScalarSizeInBits(); + } + bool isStridedAccess(const SCEV *Ptr) { return Ptr && isa<SCEVAddRecExpr>(Ptr); } diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index dd767217345a..aaf6f888e06f 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -169,8 +169,12 @@ template <typename T> class ArrayRef; /// Return true if we can prove that the specified FP value is either a NaN or /// never less than 0.0. - bool CannotBeOrderedLessThanZero(const Value *V, const TargetLibraryInfo *TLI, - unsigned Depth = 0); + /// If \p IncludeNeg0 is false, -0.0 is considered less than 0.0. + bool CannotBeOrderedLessThanZero(const Value *V, const TargetLibraryInfo *TLI); + + /// \returns true if we can prove that the specified FP value has a 0 sign + /// bit. + bool SignBitMustBeZero(const Value *V, const TargetLibraryInfo *TLI); /// If the specified value can be set by repeating the same byte in memory, /// return the i8 value that it is represented with. This is true for all i8 diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index 8e96336b981f..7efdbcccdef5 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -308,7 +308,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) { + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()) { // Check if any of the operands are vector operands. 
const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); diff --git a/include/llvm/CodeGen/DIE.h b/include/llvm/CodeGen/DIE.h index 09c3bf6a1b56..95c4b4248bbd 100644 --- a/include/llvm/CodeGen/DIE.h +++ b/include/llvm/CodeGen/DIE.h @@ -52,13 +52,20 @@ class DIEAbbrevData { /// Dwarf form code. dwarf::Form Form; + /// Dwarf attribute value for DW_FORM_implicit_const + int64_t Value; + public: - DIEAbbrevData(dwarf::Attribute A, dwarf::Form F) : Attribute(A), Form(F) {} + DIEAbbrevData(dwarf::Attribute A, dwarf::Form F) + : Attribute(A), Form(F), Value(0) {} + DIEAbbrevData(dwarf::Attribute A, int64_t V) + : Attribute(A), Form(dwarf::DW_FORM_implicit_const), Value(V) {} /// Accessors. /// @{ dwarf::Attribute getAttribute() const { return Attribute; } dwarf::Form getForm() const { return Form; } + int64_t getValue() const { return Value; } /// @} /// Used to gather unique data for the abbreviation folding set. @@ -102,6 +109,11 @@ public: Data.push_back(DIEAbbrevData(Attribute, Form)); } + /// Adds attribute with DW_FORM_implicit_const value + void AddImplicitConstAttribute(dwarf::Attribute Attribute, int64_t Value) { + Data.push_back(DIEAbbrevData(Attribute, Value)); + } + /// Used to gather unique data for the abbreviation folding set. void Profile(FoldingSetNodeID &ID) const; diff --git a/include/llvm/CodeGen/GlobalISel/RegBankSelect.h b/include/llvm/CodeGen/GlobalISel/RegBankSelect.h index 106fc9ffb8b5..b331533cd7fb 100644 --- a/include/llvm/CodeGen/GlobalISel/RegBankSelect.h +++ b/include/llvm/CodeGen/GlobalISel/RegBankSelect.h @@ -76,6 +76,7 @@ class MachineBlockFrequencyInfo; class MachineRegisterInfo; class TargetPassConfig; class TargetRegisterInfo; +class raw_ostream; /// This pass implements the reg bank selector pass used in the GlobalISel /// pipeline. At the end of this pass, all register operands have been assigned @@ -450,6 +451,18 @@ private: bool operator>(const MappingCost &Cost) const { return *this != Cost && Cost < *this; } + + /// Print this on dbgs() stream. + void dump() const; + + /// Print this on \p OS; + void print(raw_ostream &OS) const; + + /// Overload the stream operator for easy debug printing. + friend raw_ostream &operator<<(raw_ostream &OS, const MappingCost &Cost) { + Cost.print(OS); + return OS; + } }; /// Interface to the target lowering info related @@ -626,6 +639,7 @@ public: /// \endcode bool runOnMachineFunction(MachineFunction &MF) override; }; + } // End namespace llvm. #endif diff --git a/include/llvm/CodeGen/GlobalISel/RegisterBank.h b/include/llvm/CodeGen/GlobalISel/RegisterBank.h index 075677d30179..b05bf9948243 100644 --- a/include/llvm/CodeGen/GlobalISel/RegisterBank.h +++ b/include/llvm/CodeGen/GlobalISel/RegisterBank.h @@ -41,11 +41,8 @@ private: friend RegisterBankInfo; public: - /// The default constructor will leave the object in - /// an invalid state. I.e. isValid() == false. - /// The fields must be updated to fix that and only - /// RegisterBankInfo instances are allowed to do that - RegisterBank(); + RegisterBank(unsigned ID, const char *Name, unsigned Size, + const uint32_t *ContainedRegClasses); /// Get the identifier of this register bank. 
unsigned getID() const { return ID; } diff --git a/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h b/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h index 4d4a226eb2d2..312dc9314d45 100644 --- a/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h +++ b/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h @@ -384,10 +384,6 @@ protected: /// Create a RegisterBankInfo that can accomodate up to \p NumRegBanks /// RegisterBank instances. - /// - /// \note For the verify method to succeed all the \p NumRegBanks - /// must be initialized by createRegisterBank and updated with - /// addRegBankCoverage RegisterBank. RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks); /// This constructor is meaningless. @@ -400,31 +396,6 @@ protected: llvm_unreachable("This constructor should not be executed"); } - /// Create a new register bank with the given parameter and add it - /// to RegBanks. - /// \pre \p ID must not already be used. - /// \pre \p ID < NumRegBanks. - void createRegisterBank(unsigned ID, const char *Name); - - /// Add \p RCId to the set of register class that the register bank, - /// identified \p ID, covers. - /// This method transitively adds all the sub classes and the subreg-classes - /// of \p RCId to the set of covered register classes. - /// It also adjusts the size of the register bank to reflect the maximal - /// size of a value that can be hold into that register bank. - /// - /// \note This method does *not* add the super classes of \p RCId. - /// The rationale is if \p ID covers the registers of \p RCId, that - /// does not necessarily mean that \p ID covers the set of registers - /// of RCId's superclasses. - /// This method does *not* add the superreg classes as well for consistents. - /// The expected use is to add the coverage top-down with respect to the - /// register hierarchy. - /// - /// \todo TableGen should just generate the BitSet vector for us. - void addRegBankCoverage(unsigned ID, unsigned RCId, - const TargetRegisterInfo &TRI); - /// Get the register bank identified by \p ID. RegisterBank &getRegBank(unsigned ID) { assert(ID < getNumRegBanks() && "Accessing an unknown register bank"); diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index 420b03ec02bd..df700bf0c53d 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -503,19 +503,6 @@ namespace ISD { /// address spaces. ADDRSPACECAST, - /// CONVERT_RNDSAT - This operator is used to support various conversions - /// between various types (float, signed, unsigned and vectors of those - /// types) with rounding and saturation. NOTE: Avoid using this operator as - /// most target don't support it and the operator might be removed in the - /// future. It takes the following arguments: - /// 0) value - /// 1) dest type (type to convert to) - /// 2) src type (type to convert from) - /// 3) rounding imm - /// 4) saturation imm - /// 5) ISD::CvtCode indicating the type of conversion to do - CONVERT_RNDSAT, - /// FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions /// and truncation for half-precision (16 bit) floating numbers. These nodes /// form a semi-softened interface for dealing with f16 (as an i16), which @@ -927,21 +914,6 @@ namespace ISD { /// SETCC_INVALID if it is not possible to represent the resultant comparison. 
CondCode getSetCCAndOperation(CondCode Op1, CondCode Op2, bool isInteger); - //===--------------------------------------------------------------------===// - /// This enum defines the various converts CONVERT_RNDSAT supports. - enum CvtCode { - CVT_FF, /// Float from Float - CVT_FS, /// Float from Signed - CVT_FU, /// Float from Unsigned - CVT_SF, /// Signed from Float - CVT_UF, /// Unsigned from Float - CVT_SS, /// Signed from Signed - CVT_SU, /// Signed from Unsigned - CVT_US, /// Unsigned from Signed - CVT_UU, /// Unsigned from Unsigned - CVT_INVALID /// Marker - Invalid opcode - }; - } // end llvm::ISD namespace } // end llvm namespace diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index 7927982e782d..54d0436e4ab8 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -626,12 +626,6 @@ public: SDValue getCondCode(ISD::CondCode Cond); - /// Returns the ConvertRndSat Note: Avoid using this node because it may - /// disappear in the future and most targets don't support it. - SDValue getConvertRndSat(EVT VT, const SDLoc &dl, SDValue Val, SDValue DTy, - SDValue STy, SDValue Rnd, SDValue Sat, - ISD::CvtCode Code); - /// Return an ISD::VECTOR_SHUFFLE node. The number of elements in VT, /// which must be a vector type, must match the number of mask elements /// NumElts. An integer mask element equal to -1 is treated as undefined. diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index d4b7170eac3c..b6f5424dbbd7 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1860,26 +1860,6 @@ public: } }; -/// NOTE: avoid using this node as this may disappear in the -/// future and most targets don't support it. -class CvtRndSatSDNode : public SDNode { - ISD::CvtCode CvtCode; - - friend class SelectionDAG; - - explicit CvtRndSatSDNode(EVT VT, unsigned Order, const DebugLoc &dl, - ISD::CvtCode Code) - : SDNode(ISD::CONVERT_RNDSAT, Order, dl, getSDVTList(VT)), CvtCode(Code) { - } - -public: - ISD::CvtCode getCvtCode() const { return CvtCode; } - - static bool classof(const SDNode *N) { - return N->getOpcode() == ISD::CONVERT_RNDSAT; - } -}; - /// This class is used to represent EVT's, which are used /// to parameterize some operations. class VTSDNode : public SDNode { @@ -2041,7 +2021,7 @@ public: friend class SelectionDAG; MaskedStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, - bool isTrunc, bool isCompressing, EVT MemVT, + bool isTrunc, bool isCompressing, EVT MemVT, MachineMemOperand *MMO) : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, VTs, MemVT, MMO) { StoreSDNodeBits.IsTruncating = isTrunc; @@ -2054,8 +2034,8 @@ public: bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; } /// Returns true if the op does a compression to the vector before storing. - /// The node contiguously stores the active elements (integers or floats) - /// in src (those with their respective bit set in writemask k) to unaligned + /// The node contiguously stores the active elements (integers or floats) + /// in src (those with their respective bit set in writemask k) to unaligned /// memory at base_addr. 
bool isCompressingStore() const { return StoreSDNodeBits.IsCompressing; } diff --git a/include/llvm/DebugInfo/CodeView/CVTypeDumper.h b/include/llvm/DebugInfo/CodeView/CVTypeDumper.h new file mode 100644 index 000000000000..e1dd6a10b5a1 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/CVTypeDumper.h @@ -0,0 +1,56 @@ +//===-- CVTypeDumper.h - CodeView type info dumper --------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_CODEVIEW_CVTYPEDUMPER_H +#define LLVM_DEBUGINFO_CODEVIEW_CVTYPEDUMPER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" +#include "llvm/Support/ScopedPrinter.h" + +namespace llvm { + +namespace codeview { + +/// Dumper for CodeView type streams found in COFF object files and PDB files. +class CVTypeDumper { +public: + explicit CVTypeDumper(TypeDatabase &TypeDB) : TypeDB(TypeDB) {} + + /// Dumps one type record. Returns false if there was a type parsing error, + /// and true otherwise. This should be called in order, since the dumper + /// maintains state about previous records which are necessary for cross + /// type references. + Error dump(const CVType &Record, TypeVisitorCallbacks &Dumper); + + /// Dumps the type records in Types. Returns false if there was a type stream + /// parse error, and true otherwise. + Error dump(const CVTypeArray &Types, TypeVisitorCallbacks &Dumper); + + /// Dumps the type records in Data. Returns false if there was a type stream + /// parse error, and true otherwise. Use this method instead of the + /// CVTypeArray overload when type records are laid out contiguously in + /// memory. + Error dump(ArrayRef<uint8_t> Data, TypeVisitorCallbacks &Dumper); + + static void printTypeIndex(ScopedPrinter &Printer, StringRef FieldName, + TypeIndex TI, TypeDatabase &DB); + +private: + TypeDatabase &TypeDB; +}; + +} // end namespace codeview +} // end namespace llvm + +#endif // LLVM_DEBUGINFO_CODEVIEW_TYPEDUMPER_H diff --git a/include/llvm/DebugInfo/CodeView/SymbolDumper.h b/include/llvm/DebugInfo/CodeView/SymbolDumper.h index eb63f7895a1e..a5419b37e776 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolDumper.h +++ b/include/llvm/DebugInfo/CodeView/SymbolDumper.h @@ -20,15 +20,15 @@ namespace llvm { class ScopedPrinter; namespace codeview { -class CVTypeDumper; +class TypeDatabase; /// Dumper for CodeView symbol streams found in COFF object files and PDB files. class CVSymbolDumper { public: - CVSymbolDumper(ScopedPrinter &W, CVTypeDumper &CVTD, + CVSymbolDumper(ScopedPrinter &W, TypeDatabase &TypeDB, std::unique_ptr<SymbolDumpDelegate> ObjDelegate, bool PrintRecordBytes) - : W(W), CVTD(CVTD), ObjDelegate(std::move(ObjDelegate)), + : W(W), TypeDB(TypeDB), ObjDelegate(std::move(ObjDelegate)), PrintRecordBytes(PrintRecordBytes) {} /// Dumps one type record. 
Returns false if there was a type parsing error, @@ -43,7 +43,7 @@ public: private: ScopedPrinter &W; - CVTypeDumper &CVTD; + TypeDatabase &TypeDB; std::unique_ptr<SymbolDumpDelegate> ObjDelegate; bool PrintRecordBytes; diff --git a/include/llvm/DebugInfo/CodeView/TypeDatabase.h b/include/llvm/DebugInfo/CodeView/TypeDatabase.h new file mode 100644 index 000000000000..cccc2868ffb5 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/TypeDatabase.h @@ -0,0 +1,55 @@ +//===- TypeDatabase.h - A collection of CodeView type records ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASE_H +#define LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASE_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/StringSaver.h" + +namespace llvm { +namespace codeview { +class TypeDatabase { +public: + TypeDatabase() : TypeNameStorage(Allocator) {} + + /// Gets the type index for the next type record. + TypeIndex getNextTypeIndex() const; + + /// Records the name of a type, and reserves its type index. + void recordType(StringRef Name, CVType Data); + + /// Saves the name in a StringSet and creates a stable StringRef. + StringRef saveTypeName(StringRef TypeName); + + StringRef getTypeName(TypeIndex Index) const; + + bool containsTypeIndex(TypeIndex Index) const; + + uint32_t size() const; + +private: + BumpPtrAllocator Allocator; + + /// All user defined type records in .debug$T live in here. Type indices + /// greater than 0x1000 are user defined. Subtract 0x1000 from the index to + /// index into this vector. + SmallVector<StringRef, 10> CVUDTNames; + SmallVector<CVType, 10> TypeRecords; + + StringSaver TypeNameStorage; +}; +} +} + +#endif
\ No newline at end of file diff --git a/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h new file mode 100644 index 000000000000..39d234cf9814 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h @@ -0,0 +1,53 @@ +//===-- TypeDatabaseVisitor.h -----------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASEVISITOR_H +#define LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASEVISITOR_H + +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" + +namespace llvm { +namespace codeview { + +/// Dumper for CodeView type streams found in COFF object files and PDB files. +class TypeDatabaseVisitor : public TypeVisitorCallbacks { +public: + explicit TypeDatabaseVisitor(TypeDatabase &TypeDB) : TypeDB(TypeDB) {} + + /// Paired begin/end actions for all types. Receives all record data, + /// including the fixed-length record prefix. + Error visitTypeBegin(CVType &Record) override; + Error visitTypeEnd(CVType &Record) override; + Error visitMemberBegin(CVMemberRecord &Record) override; + Error visitMemberEnd(CVMemberRecord &Record) override; + +#define TYPE_RECORD(EnumName, EnumVal, Name) \ + Error visitKnownRecord(CVType &CVR, Name##Record &Record) override; +#define MEMBER_RECORD(EnumName, EnumVal, Name) \ + Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override; +#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) +#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) +#include "TypeRecords.def" + +private: + bool IsInFieldList = false; + + /// Name of the current type. Only valid before visitTypeEnd. + StringRef Name; + + TypeDatabase &TypeDB; +}; + +} // end namespace codeview +} // end namespace llvm + +#endif // LLVM_DEBUGINFO_CODEVIEW_TYPEDUMPER_H diff --git a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h new file mode 100644 index 000000000000..a466e4298158 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h @@ -0,0 +1,67 @@ +//===-- TypeDumpVisitor.h - CodeView type info dumper -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEDUMPVISITOR_H +#define LLVM_DEBUGINFO_CODEVIEW_TYPEDUMPVISITOR_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" + +namespace llvm { +class ScopedPrinter; + +namespace codeview { + +/// Dumper for CodeView type streams found in COFF object files and PDB files. 
+class TypeDumpVisitor : public TypeVisitorCallbacks { +public: + TypeDumpVisitor(TypeDatabase &TypeDB, ScopedPrinter *W, bool PrintRecordBytes) + : W(W), PrintRecordBytes(PrintRecordBytes), TypeDB(TypeDB) {} + + void printTypeIndex(StringRef FieldName, TypeIndex TI) const; + + /// Action to take on unknown types. By default, they are ignored. + Error visitUnknownType(CVType &Record) override; + Error visitUnknownMember(CVMemberRecord &Record) override; + + /// Paired begin/end actions for all types. Receives all record data, + /// including the fixed-length record prefix. + Error visitTypeBegin(CVType &Record) override; + Error visitTypeEnd(CVType &Record) override; + Error visitMemberBegin(CVMemberRecord &Record) override; + Error visitMemberEnd(CVMemberRecord &Record) override; + +#define TYPE_RECORD(EnumName, EnumVal, Name) \ + Error visitKnownRecord(CVType &CVR, Name##Record &Record) override; +#define MEMBER_RECORD(EnumName, EnumVal, Name) \ + Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override; +#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) +#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) +#include "TypeRecords.def" + +private: + void printMemberAttributes(MemberAttributes Attrs); + void printMemberAttributes(MemberAccess Access, MethodKind Kind, + MethodOptions Options); + + ScopedPrinter *W; + + bool PrintRecordBytes = false; + + TypeDatabase &TypeDB; +}; + +} // end namespace codeview +} // end namespace llvm + +#endif diff --git a/include/llvm/DebugInfo/CodeView/TypeDumper.h b/include/llvm/DebugInfo/CodeView/TypeDumper.h deleted file mode 100644 index 5a8b555cec02..000000000000 --- a/include/llvm/DebugInfo/CodeView/TypeDumper.h +++ /dev/null @@ -1,108 +0,0 @@ -//===-- TypeDumper.h - CodeView type info dumper ----------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEDUMPER_H -#define LLVM_DEBUGINFO_CODEVIEW_TYPEDUMPER_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/StringSet.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" - -namespace llvm { -class ScopedPrinter; - -namespace codeview { - -/// Dumper for CodeView type streams found in COFF object files and PDB files. -class CVTypeDumper : public TypeVisitorCallbacks { -public: - CVTypeDumper(ScopedPrinter *W, bool PrintRecordBytes) - : W(W), PrintRecordBytes(PrintRecordBytes) {} - - StringRef getTypeName(TypeIndex TI); - void printTypeIndex(StringRef FieldName, TypeIndex TI); - - /// Dumps one type record. Returns false if there was a type parsing error, - /// and true otherwise. This should be called in order, since the dumper - /// maintains state about previous records which are necessary for cross - /// type references. - Error dump(const CVRecord<TypeLeafKind> &Record); - - /// Dumps the type records in Types. Returns false if there was a type stream - /// parse error, and true otherwise. - Error dump(const CVTypeArray &Types); - - /// Dumps the type records in Data. Returns false if there was a type stream - /// parse error, and true otherwise. Use this method instead of the - /// CVTypeArray overload when type records are laid out contiguously in - /// memory. 
- Error dump(ArrayRef<uint8_t> Data); - - /// Gets the type index for the next type record. - unsigned getNextTypeIndex() const { - return 0x1000 + CVUDTNames.size(); - } - - /// Records the name of a type, and reserves its type index. - void recordType(StringRef Name) { CVUDTNames.push_back(Name); } - - /// Saves the name in a StringSet and creates a stable StringRef. - StringRef saveName(StringRef TypeName) { - return TypeNames.insert(TypeName).first->getKey(); - } - - void setPrinter(ScopedPrinter *P); - ScopedPrinter *getPrinter() { return W; } - - /// Action to take on unknown types. By default, they are ignored. - Error visitUnknownType(CVType &Record) override; - Error visitUnknownMember(CVMemberRecord &Record) override; - - /// Paired begin/end actions for all types. Receives all record data, - /// including the fixed-length record prefix. - Error visitTypeBegin(CVType &Record) override; - Error visitTypeEnd(CVType &Record) override; - Error visitMemberBegin(CVMemberRecord &Record) override; - Error visitMemberEnd(CVMemberRecord &Record) override; - -#define TYPE_RECORD(EnumName, EnumVal, Name) \ - Error visitKnownRecord(CVType &CVR, Name##Record &Record) override; -#define MEMBER_RECORD(EnumName, EnumVal, Name) \ - Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override; -#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" - -private: - void printMemberAttributes(MemberAttributes Attrs); - void printMemberAttributes(MemberAccess Access, MethodKind Kind, - MethodOptions Options); - - ScopedPrinter *W; - - bool IsInFieldList = false; - bool PrintRecordBytes = false; - - /// Name of the current type. Only valid before visitTypeEnd. - StringRef Name; - - /// All user defined type records in .debug$T live in here. Type indices - /// greater than 0x1000 are user defined. Subtract 0x1000 from the index to - /// index into this vector. - SmallVector<StringRef, 10> CVUDTNames; - - StringSet<> TypeNames; -}; - -} // end namespace codeview -} // end namespace llvm - -#endif // LLVM_DEBUGINFO_CODEVIEW_TYPEDUMPER_H diff --git a/include/llvm/DebugInfo/CodeView/TypeDumperBase.h b/include/llvm/DebugInfo/CodeView/TypeDumperBase.h new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/TypeDumperBase.h diff --git a/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h b/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h index 778817f57bf5..db9bd506be89 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h +++ b/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h @@ -23,21 +23,32 @@ class raw_ostream; class DWARFAbbreviationDeclaration { public: struct AttributeSpec { - AttributeSpec(dwarf::Attribute A, dwarf::Form F, Optional<uint8_t> S) - : Attr(A), Form(F), ByteSize(S) {} + AttributeSpec(dwarf::Attribute A, dwarf::Form F, Optional<int64_t> V) + : Attr(A), Form(F), ByteSizeOrValue(V) {} dwarf::Attribute Attr; dwarf::Form Form; - /// If ByteSize has a value, then it contains the fixed size in bytes for - /// the Form in this object. If ByteSize doesn't have a value, then the - /// byte size of Form either varies according to the DWARFUnit that it is - /// contained in or the value size varies and must be decoded from the - /// debug information in order to determine its size. 
- Optional<uint8_t> ByteSize; + /// The following field is used for ByteSize for non-implicit_const + /// attributes and as value for implicit_const ones, indicated by + /// Form == DW_FORM_implicit_const. + /// The following cases are distinguished: + /// * Form != DW_FORM_implicit_const and ByteSizeOrValue has a value: + /// ByteSizeOrValue contains the fixed size in bytes + /// for the Form in this object. + /// * Form != DW_FORM_implicit_const and ByteSizeOrValue is None: + /// byte size of Form either varies according to the DWARFUnit + /// that it is contained in or the value size varies and must be + /// decoded from the debug information in order to determine its size. + /// * Form == DW_FORM_implicit_const: + /// ByteSizeOrValue contains value for the implicit_const attribute. + Optional<int64_t> ByteSizeOrValue; + bool isImplicitConst() const { + return Form == dwarf::DW_FORM_implicit_const; + } /// Get the fixed byte size of this Form if possible. This function might /// use the DWARFUnit to calculate the size of the Form, like for /// DW_AT_address and DW_AT_ref_addr, so this isn't just an accessor for /// the ByteSize member. - Optional<uint8_t> getByteSize(const DWARFUnit &U) const; + Optional<int64_t> getByteSize(const DWARFUnit &U) const; }; typedef SmallVector<AttributeSpec, 8> AttributeSpecVector; diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h index 5a24b7c87299..e335e28b39d7 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -147,21 +147,6 @@ public: /// DW_AT_abstract_origin referenced DIEs. /// /// \param Attr the attribute to extract. - /// \param FailValue the value to return if this DIE doesn't have this - /// attribute. - /// \returns the address value of the attribute or FailValue if the - /// attribute doesn't exist or if the attribute's form isn't a form that - /// describes an address. - uint64_t getAttributeValueAsAddress(dwarf::Attribute Attr, - uint64_t FailValue) const; - - /// Extract the specified attribute from this DIE as an address. - /// - /// Extract an attribute value from this DIE only. This call doesn't look - /// for the attribute value in any DW_AT_specification or - /// DW_AT_abstract_origin referenced DIEs. - /// - /// \param Attr the attribute to extract. /// \returns an optional value for the attribute. Optional<uint64_t> getAttributeValueAsAddress(dwarf::Attribute Attr) const; @@ -172,21 +157,6 @@ public: /// DW_AT_abstract_origin referenced DIEs. /// /// \param Attr the attribute to extract. - /// \param FailValue the value to return if this DIE doesn't have this - /// attribute. - /// \returns the signed integer constant value of the attribute or FailValue - /// if the attribute doesn't exist or if the attribute's form isn't a form - /// that describes a signed integer. - int64_t getAttributeValueAsSignedConstant(dwarf::Attribute Attr, - int64_t FailValue) const; - - /// Extract the specified attribute from this DIE as a signed integer. - /// - /// Extract an attribute value from this DIE only. This call doesn't look - /// for the attribute value in any DW_AT_specification or - /// DW_AT_abstract_origin referenced DIEs. - /// - /// \param Attr the attribute to extract. /// \returns an optional value for the attribute. Optional<int64_t> getAttributeValueAsSignedConstant(dwarf::Attribute Attr) const; @@ -198,21 +168,6 @@ public: /// DW_AT_abstract_origin referenced DIEs. /// /// \param Attr the attribute to extract. 
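// Tying the AttributeSpec change above together: one Optional<int64_t> now
// carries either a fixed form size or an implicit_const value. A sketch of how
// a reader would branch on it (handleValue/skipBytes/decodeForm are
// hypothetical helpers, not in this patch):
//
//   // given an AttributeSpec S from an abbreviation declaration:
//   if (S.isImplicitConst())
//     handleValue(*S.ByteSizeOrValue);    // constant value, no data in the DIE
//   else if (S.ByteSizeOrValue.hasValue())
//     skipBytes(*S.ByteSizeOrValue);      // fixed size, e.g. 4 for DW_FORM_data4
//   else
//     decodeForm(S.Form);                 // size varies; decode from the data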
- /// \param FailValue the value to return if this DIE doesn't have this - /// attribute. - /// \returns the unsigned integer constant value of the attribute or FailValue - /// if the attribute doesn't exist or if the attribute's form isn't a form - /// that describes an unsigned integer. - uint64_t getAttributeValueAsUnsignedConstant(dwarf::Attribute Attr, - uint64_t FailValue) const; - - /// Extract the specified attribute from this DIE as an unsigned integer. - /// - /// Extract an attribute value from this DIE only. This call doesn't look - /// for the attribute value in any DW_AT_specification or - /// DW_AT_abstract_origin referenced DIEs. - /// - /// \param Attr the attribute to extract. /// \returns an optional value for the attribute. Optional<uint64_t> getAttributeValueAsUnsignedConstant(dwarf::Attribute Attr) const; @@ -224,21 +179,6 @@ public: /// DW_AT_abstract_origin referenced DIEs. /// /// \param Attr the attribute to extract. - /// \param FailValue the value to return if this DIE doesn't have this - /// attribute. - /// \returns the unsigned integer constant value of the attribute or FailValue - /// if the attribute doesn't exist or if the attribute's form isn't a form - /// that describes a reference. - uint64_t getAttributeValueAsReference(dwarf::Attribute Attr, - uint64_t FailValue) const; - - /// Extract the specified attribute from this DIE as absolute DIE Offset. - /// - /// Extract an attribute value from this DIE only. This call doesn't look - /// for the attribute value in any DW_AT_specification or - /// DW_AT_abstract_origin referenced DIEs. - /// - /// \param Attr the attribute to extract. /// \returns an optional value for the attribute. Optional<uint64_t> getAttributeValueAsReference(dwarf::Attribute Attr) const; @@ -249,20 +189,6 @@ public: /// DW_AT_abstract_origin referenced DIEs. /// /// \param Attr the attribute to extract. - /// \param FailValue the value to return if this DIE doesn't have this - /// attribute. - /// \returns the unsigned integer constant value of the attribute or FailValue - /// if the attribute doesn't exist or if the attribute's form isn't a form - /// that describes a section offset. - uint64_t getAttributeValueAsSectionOffset(dwarf::Attribute Attr, - uint64_t FailValue) const; - /// Extract the specified attribute from this DIE as absolute section offset. - /// - /// Extract an attribute value from this DIE only. This call doesn't look - /// for the attribute value in any DW_AT_specification or - /// DW_AT_abstract_origin referenced DIEs. - /// - /// \param Attr the attribute to extract. /// \returns an optional value for the attribute. 
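// With the FailValue overloads deleted above, old call sites recover the
// fallback behavior through llvm::Optional. A migration sketch (hypothetical
// call site with an in-scope DWARFDie named Die):
//
//   // before: uint64_t PC = Die.getAttributeValueAsAddress(DW_AT_low_pc, -1ULL);
//   uint64_t PC =
//       Die.getAttributeValueAsAddress(dwarf::DW_AT_low_pc).getValueOr(-1ULL);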
Optional<uint64_t> getAttributeValueAsSectionOffset(dwarf::Attribute Attr) const; diff --git a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h index 920880cea10c..1b7659dfb04a 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h +++ b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h @@ -57,6 +57,9 @@ public: DWARFFormValue(dwarf::Form F = dwarf::Form(0)) : Form(F), U(nullptr) {} dwarf::Form getForm() const { return Form; } void setForm(dwarf::Form F) { Form = F; } + void setUValue(uint64_t V) { Value.uval = V; } + void setSValue(int64_t V) { Value.sval = V; } + void setPValue(const char *V) { Value.cstr = V; } bool isFormClass(FormClass FC) const; const DWARFUnit *getUnit() const { return U; } void dump(raw_ostream &OS) const; diff --git a/include/llvm/DebugInfo/MSF/StreamArray.h b/include/llvm/DebugInfo/MSF/StreamArray.h index 3bba80d807f3..5dfeb8c524af 100644 --- a/include/llvm/DebugInfo/MSF/StreamArray.h +++ b/include/llvm/DebugInfo/MSF/StreamArray.h @@ -153,30 +153,24 @@ public: return ThisValue; } - IterType &operator+=(std::ptrdiff_t N) { - while (N > 0) { - // We are done with the current record, discard it so that we are - // positioned at the next record. - IterRef = IterRef.drop_front(ThisLen); - if (IterRef.getLength() == 0) { - // There is nothing after the current record, we must make this an end - // iterator. + IterType &operator++() { + // We are done with the current record, discard it so that we are + // positioned at the next record. + IterRef = IterRef.drop_front(ThisLen); + if (IterRef.getLength() == 0) { + // There is nothing after the current record, we must make this an end + // iterator. + moveToEnd(); + } else { + // There is some data after the current record. + auto EC = Extract(IterRef, ThisLen, ThisValue); + if (EC) { + consumeError(std::move(EC)); + markError(); + } else if (ThisLen == 0) { + // An empty record? Make this an end iterator. moveToEnd(); - return *this; - } else { - // There is some data after the current record. - auto EC = Extract(IterRef, ThisLen, ThisValue); - if (EC) { - consumeError(std::move(EC)); - markError(); - return *this; - } else if (ThisLen == 0) { - // An empty record? Make this an end iterator. - moveToEnd(); - return *this; - } } - --N; } return *this; } diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h index 932ae51b39dc..48cb7fe5df6f 100644 --- a/include/llvm/IR/DIBuilder.h +++ b/include/llvm/IR/DIBuilder.h @@ -17,7 +17,9 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/None.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/DebugInfo.h" @@ -51,6 +53,10 @@ namespace llvm { SmallVector<Metadata *, 4> AllSubprograms; SmallVector<Metadata *, 4> AllGVs; SmallVector<TrackingMDNodeRef, 4> AllImportedModules; + /// Map Macro parent (which can be DIMacroFile or nullptr) to a list of + /// Metadata all of type DIMacroNode. + /// DIMacroNode's with nullptr parent are DICompileUnit direct children. + MapVector<MDNode *, SetVector<Metadata *>> AllMacrosPerParent; /// Track nodes that may be unresolved. SmallVector<TrackingMDNodeRef, 4> UnresolvedNodes; @@ -116,6 +122,24 @@ namespace llvm { DIFile::ChecksumKind CSKind = DIFile::CSK_None, StringRef Checksum = StringRef()); + /// Create debugging information entry for a macro. + /// \param Parent Macro parent (could be nullptr). 
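// Looping back to the StreamArray change above: operator++ now extracts the
// next variable-length record as it advances, so ordinary iteration walks a
// record stream without knowing any lengths up front. A usage sketch
// (hypothetical record type and extractor):
//
//   VarStreamArray<MyRecord> Records(StreamRef);  // extractor fills ThisLen/ThisValue
//   for (const MyRecord &R : Records)             // ++ drops ThisLen bytes, re-extracts
//     consume(R);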
+ /// \param Line Source line number where the macro is defined. + /// \param MacroType DW_MACINFO_define or DW_MACINFO_undef. + /// \param Name Macro name. + /// \param Value Macro value. + DIMacro *createMacro(DIMacroFile *Parent, unsigned Line, unsigned MacroType, + StringRef Name, StringRef Value = StringRef()); + + /// Create debugging information temporary entry for a macro file. + /// List of macro node direct children will be calculated by DIBuilder, + /// using the \p Parent relationship. + /// \param Parent Macro file parent (could be nullptr). + /// \param Line Source line number where the macro file is included. + /// \param File File descriptor containing the name of the macro file. + DIMacroFile *createTempMacroFile(DIMacroFile *Parent, unsigned Line, + DIFile *File); + /// Create a single enumerator value. DIEnumerator *createEnumerator(StringRef Name, int64_t Val); @@ -447,6 +471,9 @@ namespace llvm { /// Get a DINodeArray, create one if required. DINodeArray getOrCreateArray(ArrayRef<Metadata *> Elements); + /// Get a DIMacroNodeArray, create one if required. + DIMacroNodeArray getOrCreateMacroArray(ArrayRef<Metadata *> Elements); + /// Get a DITypeRefArray, create one if required. DITypeRefArray getOrCreateTypeArray(ArrayRef<Metadata *> Elements); diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h index 26f4626ead10..187855225c50 100644 --- a/include/llvm/IR/DebugInfoMetadata.h +++ b/include/llvm/IR/DebugInfoMetadata.h @@ -1295,16 +1295,12 @@ public: /// Check \c this can be discriminated from \c RHS in a linetable entry. /// Scope and inlined-at chains are not recorded in the linetable, so they /// cannot be used to distinguish basic blocks. - /// - /// The current implementation is weaker than it should be, since it just - /// checks filename and line. - /// - /// FIXME: Add a check for getDiscriminator(). - /// FIXME: Add a check for getColumn(). - /// FIXME: Change the getFilename() check to getFile() (or add one for - /// getDirectory()). bool canDiscriminate(const DILocation &RHS) const { - return getFilename() != RHS.getFilename() || getLine() != RHS.getLine(); + return getLine() != RHS.getLine() || + getColumn() != RHS.getColumn() || + getDiscriminator() != RHS.getDiscriminator() || + getFilename() != RHS.getFilename() || + getDirectory() != RHS.getDirectory(); } /// Get the DWARF discriminator. @@ -1327,10 +1323,13 @@ public: /// represented in a single line entry. In this case, no location /// should be set. /// - /// Currently this function is simply a stub, and no location will be - /// used for all cases. - static DILocation *getMergedLocation(const DILocation *LocA, - const DILocation *LocB) { + /// Currently the function does not create a new location. If the locations + /// are the same, or cannot be discriminated, the first location is returned. + /// Otherwise an empty location will be used. 
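// A worked example for the merging rule documented above (hypothetical
// locations): two DILocations that agree on file, directory, line, column,
// and discriminator cannot be discriminated, so the first one is kept;
// locations that differ in any of those yield nullptr, i.e. "no location".
//
//   const DILocation *ML = DILocation::getMergedLocation(LocA, LocB);
//   NewInst->setDebugLoc(ML ? DebugLoc(ML) : DebugLoc());  // drop rather than guess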
+ static const DILocation *getMergedLocation(const DILocation *LocA, + const DILocation *LocB) { + if (LocA && LocB && (LocA == LocB || !LocA->canDiscriminate(*LocB))) + return LocA; return nullptr; } diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h index 11eb713a4e1d..1057f564aab3 100644 --- a/include/llvm/IR/GlobalObject.h +++ b/include/llvm/IR/GlobalObject.h @@ -37,11 +37,11 @@ protected: setGlobalValueSubClassData(0); } - std::string Section; // Section to emit this into, empty means default Comdat *ObjComdat; enum { LastAlignmentBit = 4, HasMetadataHashEntryBit, + HasSectionHashEntryBit, GlobalObjectBits, }; @@ -66,8 +66,26 @@ public: unsigned getGlobalObjectSubClassData() const; void setGlobalObjectSubClassData(unsigned Val); - bool hasSection() const { return !getSection().empty(); } - StringRef getSection() const { return Section; } + /// Check if this global has a custom object file section. + /// + /// This is more efficient than calling getSection() and checking for an empty + /// string. + bool hasSection() const { + return getGlobalValueSubClassData() & (1 << HasSectionHashEntryBit); + } + + /// Get the custom section of this global if it has one. + /// + /// If this global does not have a custom section, this will be empty and the + /// default object file section (.text, .data, etc) will be used. + StringRef getSection() const { + return hasSection() ? getSectionImpl() : StringRef(); + } + + /// Change the section for this global. + /// + /// Setting the section to the empty string tells LLVM to choose an + /// appropriate default object file section. void setSection(StringRef S); bool hasComdat() const { return getComdat() != nullptr; } @@ -134,14 +152,20 @@ public: void clearMetadata(); private: + void setGlobalObjectFlag(unsigned Bit, bool Val) { + unsigned Mask = 1 << Bit; + setGlobalValueSubClassData((~Mask & getGlobalValueSubClassData()) | + (Val ? Mask : 0u)); + } + bool hasMetadataHashEntry() const { return getGlobalValueSubClassData() & (1 << HasMetadataHashEntryBit); } void setHasMetadataHashEntry(bool HasEntry) { - unsigned Mask = 1 << HasMetadataHashEntryBit; - setGlobalValueSubClassData((~Mask & getGlobalValueSubClassData()) | - (HasEntry ? Mask : 0u)); + setGlobalObjectFlag(HasMetadataHashEntryBit, HasEntry); } + + StringRef getSectionImpl() const; }; } // end namespace llvm diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 33fd17f20dbe..89ae94270888 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -578,8 +578,8 @@ def int_invariant_end : Intrinsic<[], llvm_anyptr_ty], [IntrArgMemOnly, NoCapture<2>]>; -def int_invariant_group_barrier : Intrinsic<[llvm_ptr_ty], - [llvm_ptr_ty], +def int_invariant_group_barrier : Intrinsic<[llvm_ptr_ty], + [llvm_ptr_ty], [IntrNoMem]>; //===------------------------ Stackmap Intrinsics -------------------------===// @@ -683,29 +683,6 @@ def int_convert_to_fp16 : Intrinsic<[llvm_i16_ty], [llvm_anyfloat_ty]>; def int_convert_from_fp16 : Intrinsic<[llvm_anyfloat_ty], [llvm_i16_ty]>; } -// These convert intrinsics are to support various conversions between -// various types with rounding and saturation. NOTE: avoid using these -// intrinsics as they might be removed sometime in the future and -// most targets don't support them. 
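// On the GlobalObject hunk above: with LastAlignmentBit = 4, the two hash-entry
// flags land in bits 5 and 6 of the subclass data word, so hasSection() is a
// single bit test instead of a string comparison. The mask arithmetic in
// setGlobalObjectFlag, worked through for HasSectionHashEntryBit (= 6):
//
//   unsigned Mask = 1 << 6;                     // 0b0100'0000
//   Data = (~Mask & Data) | (Val ? Mask : 0u);  // clear the bit, then set if Val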
-def int_convertff : Intrinsic<[llvm_anyfloat_ty], - [llvm_anyfloat_ty, llvm_i32_ty, llvm_i32_ty]>; -def int_convertfsi : Intrinsic<[llvm_anyfloat_ty], - [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty]>; -def int_convertfui : Intrinsic<[llvm_anyfloat_ty], - [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty]>; -def int_convertsif : Intrinsic<[llvm_anyint_ty], - [llvm_anyfloat_ty, llvm_i32_ty, llvm_i32_ty]>; -def int_convertuif : Intrinsic<[llvm_anyint_ty], - [llvm_anyfloat_ty, llvm_i32_ty, llvm_i32_ty]>; -def int_convertss : Intrinsic<[llvm_anyint_ty], - [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty]>; -def int_convertsu : Intrinsic<[llvm_anyint_ty], - [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty]>; -def int_convertus : Intrinsic<[llvm_anyint_ty], - [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty]>; -def int_convertuu : Intrinsic<[llvm_anyint_ty], - [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty]>; - // Clear cache intrinsic, default to ignore (ie. emit nothing) // maps to void __clear_cache() on supporting platforms def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td index d1e331775b7b..2c45d148e34b 100644 --- a/include/llvm/IR/IntrinsicsAArch64.td +++ b/include/llvm/IR/IntrinsicsAArch64.td @@ -38,12 +38,6 @@ def int_aarch64_udiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, def int_aarch64_hint : Intrinsic<[], [llvm_i32_ty]>; //===----------------------------------------------------------------------===// -// RBIT - -def int_aarch64_rbit : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], - [IntrNoMem]>; - -//===----------------------------------------------------------------------===// // Data Barrier Instructions def int_aarch64_dmb : GCCBuiltin<"__builtin_arm_dmb">, Intrinsic<[], [llvm_i32_ty]>; diff --git a/include/llvm/IR/IntrinsicsARM.td b/include/llvm/IR/IntrinsicsARM.td index 099598596885..24239689a62e 100644 --- a/include/llvm/IR/IntrinsicsARM.td +++ b/include/llvm/IR/IntrinsicsARM.td @@ -156,11 +156,6 @@ def int_arm_hint : Intrinsic<[], [llvm_i32_ty]>; def int_arm_dbg : Intrinsic<[], [llvm_i32_ty]>; //===----------------------------------------------------------------------===// -// RBIT - -def int_arm_rbit : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - -//===----------------------------------------------------------------------===// // UND (reserved undefined sequence) def int_arm_undefined : Intrinsic<[], [llvm_i32_ty]>; diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h index ecb0435a1e11..83c4ae011216 100644 --- a/include/llvm/IR/ModuleSummaryIndex.h +++ b/include/llvm/IR/ModuleSummaryIndex.h @@ -317,10 +317,10 @@ struct TypeTestResolution { /// All-Ones Bit Vectors") } TheKind = Unsat; - /// Range of the size expressed as a bit width. For example, if the size is in - /// range [0,256), this number will be 8. This helps generate the most compact + /// Range of size-1 expressed as a bit width. For example, if the size is in + /// range [1,256], this number will be 8. This helps generate the most compact /// instruction sequences. 
-  unsigned SizeBitWidth = 0;
+  unsigned SizeM1BitWidth = 0;
 };
 
 struct TypeIdSummary {
diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h
index aeb66633f2c8..e2880ec6fec8 100644
--- a/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -29,7 +29,7 @@ template <> struct ScalarEnumerationTraits<TypeTestResolution::Kind> {
 template <> struct MappingTraits<TypeTestResolution> {
   static void mapping(IO &io, TypeTestResolution &res) {
     io.mapOptional("Kind", res.TheKind);
-    io.mapOptional("SizeBitWidth", res.SizeBitWidth);
+    io.mapOptional("SizeM1BitWidth", res.SizeM1BitWidth);
   }
 };
 
diff --git a/include/llvm/Object/Decompressor.h b/include/llvm/Object/Decompressor.h
new file mode 100644
index 000000000000..a11857d546aa
--- /dev/null
+++ b/include/llvm/Object/Decompressor.h
@@ -0,0 +1,64 @@
+//===-- Decompressor.h ------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJECT_DECOMPRESSOR_H
+#define LLVM_OBJECT_DECOMPRESSOR_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Object/ObjectFile.h"
+
+namespace llvm {
+namespace object {
+
+/// @brief Decompressor handles decompression of compressed sections.
+class Decompressor {
+public:
+  /// @brief Create a decompressor object.
+  /// @param Name Section name.
+  /// @param Data Section content.
+  /// @param IsLE Flag determining whether Data is in little-endian form.
+  /// @param Is64Bit Flag determining whether the object is 64-bit.
+  static Expected<Decompressor> create(StringRef Name, StringRef Data,
+                                       bool IsLE, bool Is64Bit);
+
+  /// @brief Resize the buffer and uncompress section data into it.
+  /// @param Out Destination buffer.
+  Error decompress(SmallString<32> &Out);
+
+  /// @brief Uncompress section data into the raw buffer provided.
+  /// @param Buffer Destination buffer.
+  Error decompress(MutableArrayRef<char> Buffer);
+
+  /// @brief Return the memory buffer size required for decompression.
+  uint64_t getDecompressedSize() { return DecompressedSize; }
+
+  /// @brief Return true if the section is compressed, including the GNU-style case.
+  static bool isCompressed(const object::SectionRef &Section);
+
+  /// @brief Return true if the section is an ELF compressed one.
+  static bool isCompressedELFSection(uint64_t Flags, StringRef Name);
+
+  /// @brief Return true if the section name matches the GNU-style compressed name.
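// A usage sketch for the class above (hypothetical call site; Name and Data
// come from an object-file section, error handling is abbreviated):
//
//   Expected<Decompressor> D =
//       Decompressor::create(Name, Data, /*IsLE=*/true, /*Is64Bit=*/true);
//   if (!D)
//     return D.takeError();
//   SmallString<32> Out;
//   if (Error E = D->decompress(Out))
//     return std::move(E);
//   // Out now holds getDecompressedSize() bytes of uncompressed section data.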
+ static bool isGnuStyle(StringRef Name); + +private: + Decompressor(StringRef Data); + + Error consumeCompressedGnuHeader(); + Error consumeCompressedZLibHeader(bool Is64Bit, bool IsLittleEndian); + + StringRef SectionData; + uint64_t DecompressedSize; +}; + +} // end namespace object +} // end namespace llvm + +#endif // LLVM_OBJECT_DECOMPRESSOR_H diff --git a/include/llvm/ObjectYAML/DWARFYAML.h b/include/llvm/ObjectYAML/DWARFYAML.h index 222cad61a992..d031b5ac404c 100644 --- a/include/llvm/ObjectYAML/DWARFYAML.h +++ b/include/llvm/ObjectYAML/DWARFYAML.h @@ -85,6 +85,41 @@ struct Unit { std::vector<Entry> Entries; }; +struct File { + StringRef Name; + uint64_t DirIdx; + uint64_t ModTime; + uint64_t Length; +}; + +struct LineTableOpcode { + dwarf::LineNumberOps Opcode; + uint64_t ExtLen; + dwarf::LineNumberExtendedOps SubOpcode; + uint64_t Data; + int64_t SData; + File FileEntry; + std::vector<llvm::yaml::Hex8> UnknownOpcodeData; + std::vector<llvm::yaml::Hex64> StandardOpcodeData; +}; + +struct LineTable { + uint32_t TotalLength; + uint64_t TotalLength64; + uint16_t Version; + uint64_t PrologueLength; + uint8_t MinInstLength; + uint8_t MaxOpsPerInst; + uint8_t DefaultIsStmt; + uint8_t LineBase; + uint8_t LineRange; + uint8_t OpcodeBase; + std::vector<uint8_t> StandardOpcodeLengths; + std::vector<StringRef> IncludeDirs; + std::vector<File> Files; + std::vector<LineTableOpcode> Opcodes; +}; + struct Data { bool IsLittleEndian; std::vector<Abbrev> AbbrevDecls; @@ -98,6 +133,8 @@ struct Data { std::vector<Unit> CompileUnits; + std::vector<LineTable> DebugLines; + bool isEmpty() const; }; @@ -105,6 +142,7 @@ struct Data { } // namespace llvm LLVM_YAML_IS_SEQUENCE_VECTOR(uint8_t) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::Hex64) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::StringRef) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::Hex8) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::AttributeAbbrev) @@ -115,6 +153,9 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::PubEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::Unit) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::FormValue) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::Entry) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::File) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::LineTable) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::LineTableOpcode) namespace llvm { namespace yaml { @@ -159,6 +200,18 @@ template <> struct MappingTraits<DWARFYAML::FormValue> { static void mapping(IO &IO, DWARFYAML::FormValue &FormValue); }; +template <> struct MappingTraits<DWARFYAML::File> { + static void mapping(IO &IO, DWARFYAML::File &File); +}; + +template <> struct MappingTraits<DWARFYAML::LineTableOpcode> { + static void mapping(IO &IO, DWARFYAML::LineTableOpcode &LineTableOpcode); +}; + +template <> struct MappingTraits<DWARFYAML::LineTable> { + static void mapping(IO &IO, DWARFYAML::LineTable &LineTable); +}; + #define HANDLE_DW_TAG(unused, name) \ io.enumCase(value, "DW_TAG_" #name, dwarf::DW_TAG_##name); @@ -169,6 +222,26 @@ template <> struct ScalarEnumerationTraits<dwarf::Tag> { } }; +#define HANDLE_DW_LNS(unused, name) \ + io.enumCase(value, "DW_LNS_" #name, dwarf::DW_LNS_##name); + +template <> struct ScalarEnumerationTraits<dwarf::LineNumberOps> { + static void enumeration(IO &io, dwarf::LineNumberOps &value) { +#include "llvm/Support/Dwarf.def" + io.enumFallback<Hex8>(value); + } +}; + +#define HANDLE_DW_LNE(unused, name) \ + io.enumCase(value, "DW_LNE_" #name, dwarf::DW_LNE_##name); + +template <> struct ScalarEnumerationTraits<dwarf::LineNumberExtendedOps> 
{ + static void enumeration(IO &io, dwarf::LineNumberExtendedOps &value) { +#include "llvm/Support/Dwarf.def" + io.enumFallback<Hex16>(value); + } +}; + #define HANDLE_DW_AT(unused, name) \ io.enumCase(value, "DW_AT_" #name, dwarf::DW_AT_##name); diff --git a/include/llvm/ObjectYAML/MachOYAML.h b/include/llvm/ObjectYAML/MachOYAML.h index 6b7a924f5143..9ec32d265bca 100644 --- a/include/llvm/ObjectYAML/MachOYAML.h +++ b/include/llvm/ObjectYAML/MachOYAML.h @@ -139,7 +139,6 @@ struct UniversalBinary { LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::LoadCommand) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::Section) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::Hex64) LLVM_YAML_IS_SEQUENCE_VECTOR(int64_t) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::RebaseOpcode) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::BindOpcode) diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h index ba3238c86044..d76c13984d11 100644 --- a/include/llvm/Passes/PassBuilder.h +++ b/include/llvm/Passes/PassBuilder.h @@ -18,8 +18,8 @@ #include "llvm/ADT/Optional.h" #include "llvm/Analysis/CGSCCPassManager.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include <vector> namespace llvm { diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index 094f3af005d3..c7e558efa3dc 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -230,6 +230,15 @@ class InstrProfSymtab; /// bytes. This method decodes the string and populates the \c Symtab. Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab); +/// Check if INSTR_PROF_RAW_VERSION_VAR is defined. This global is only being +/// set in IR PGO compilation. +bool isIRPGOFlagSet(const Module *M); + +/// Check if we can safely rename this Comdat function. Instances of the same +/// comdat function may have different control flows thus can not share the +/// same counter variable. +bool canRenameComdatFunc(const Function &F, bool CheckAddressTaken = false); + enum InstrProfValueKind : uint32_t { #define VALUE_PROF_KIND(Enumerator, Value) Enumerator = Value, #include "llvm/ProfileData/InstrProfData.inc" diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h index 204672f88dd9..8d4ac81d2942 100644 --- a/include/llvm/Support/CommandLine.h +++ b/include/llvm/Support/CommandLine.h @@ -201,7 +201,7 @@ public: void reset(); - operator bool() const; + explicit operator bool() const; StringRef getName() const { return Name; } StringRef getDescription() const { return Description; } diff --git a/include/llvm/Support/Dwarf.h b/include/llvm/Support/Dwarf.h index 1a984037da09..8336b9df9df0 100644 --- a/include/llvm/Support/Dwarf.h +++ b/include/llvm/Support/Dwarf.h @@ -207,7 +207,7 @@ enum DiscriminantList { }; /// Line Number Standard Opcode Encodings. 
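// A behavioral note on the CommandLine.h hunk above: making the bool
// conversion explicit keeps tests in boolean contexts working while rejecting
// silent arithmetic conversions (hypothetical object Opt of that class):
//
//   if (Opt) { /* ... */ }            // still fine: contextual conversion to bool
//   bool B = static_cast<bool>(Opt);  // fine: conversion is spelled out
//   int N = Opt;                      // now ill-formed: no implicit conversion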
-enum LineNumberOps { +enum LineNumberOps : uint8_t { #define HANDLE_DW_LNS(ID, NAME) DW_LNS_##NAME = ID, #include "llvm/Support/Dwarf.def" }; diff --git a/include/llvm/Support/FileOutputBuffer.h b/include/llvm/Support/FileOutputBuffer.h index 3bcf64a8a08b..2c054c75374b 100644 --- a/include/llvm/Support/FileOutputBuffer.h +++ b/include/llvm/Support/FileOutputBuffer.h @@ -78,11 +78,12 @@ private: FileOutputBuffer &operator=(const FileOutputBuffer &) = delete; FileOutputBuffer(std::unique_ptr<llvm::sys::fs::mapped_file_region> R, - StringRef Path, StringRef TempPath); + StringRef Path, StringRef TempPath, bool IsRegular); std::unique_ptr<llvm::sys::fs::mapped_file_region> Region; SmallString<128> FinalPath; SmallString<128> TempPath; + bool IsRegular; }; } // end namespace llvm diff --git a/include/llvm/Support/GenericDomTree.h b/include/llvm/Support/GenericDomTree.h index 07a53438085a..6e6ee4001644 100644 --- a/include/llvm/Support/GenericDomTree.h +++ b/include/llvm/Support/GenericDomTree.h @@ -571,9 +571,15 @@ public: // API to update (Post)DominatorTree information based on modifications to // the CFG... - /// addNewBlock - Add a new node to the dominator tree information. This - /// creates a new node as a child of DomBB dominator node,linking it into - /// the children list of the immediate dominator. + /// Add a new node to the dominator tree information. + /// + /// This creates a new node as a child of DomBB dominator node, linking it + /// into the children list of the immediate dominator. + /// + /// \param BB New node in CFG. + /// \param DomBB CFG node that is dominator for BB. + /// \returns New dominator tree node that represents new CFG node. + /// DomTreeNodeBase<NodeT> *addNewBlock(NodeT *BB, NodeT *DomBB) { assert(getNode(BB) == nullptr && "Block already in dominator tree!"); DomTreeNodeBase<NodeT> *IDomNode = getNode(DomBB); @@ -583,6 +589,31 @@ public: llvm::make_unique<DomTreeNodeBase<NodeT>>(BB, IDomNode))).get(); } + /// Add a new node to the forward dominator tree and make it a new root. + /// + /// \param BB New node in CFG. + /// \returns New dominator tree node that represents new CFG node. + /// + DomTreeNodeBase<NodeT> *setNewRoot(NodeT *BB) { + assert(getNode(BB) == nullptr && "Block already in dominator tree!"); + assert(!this->isPostDominator() && + "Cannot change root of post-dominator tree"); + DFSInfoValid = false; + auto &Roots = DominatorBase<NodeT>::Roots; + DomTreeNodeBase<NodeT> *NewNode = (DomTreeNodes[BB] = + llvm::make_unique<DomTreeNodeBase<NodeT>>(BB, nullptr)).get(); + if (Roots.empty()) { + addRoot(BB); + } else { + assert(Roots.size() == 1); + NodeT *OldRoot = Roots.front(); + DomTreeNodes[OldRoot] = + NewNode->addChild(std::move(DomTreeNodes[OldRoot])); + Roots[0] = BB; + } + return RootNode = NewNode; + } + /// changeImmediateDominator - This method is used to update the dominator /// tree information when a node's immediate dominator changes. 
/// diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index fb43ef19a645..3728a7a8cb17 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -23,66 +23,80 @@ #ifndef LLVM_TARGET_TARGETLOWERING_H #define LLVM_TARGET_TARGETLOWERING_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/DAGCombine.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetCallingConv.h" #include "llvm/Target/TargetMachine.h" +#include <algorithm> +#include <cassert> #include <climits> +#include <cstdint> +#include <iterator> #include <map> +#include <string> +#include <utility> #include <vector> namespace llvm { - class BranchProbability; - class CallInst; - class CCState; - class CCValAssign; - class FastISel; - class FunctionLoweringInfo; - class ImmutableCallSite; - class IntrinsicInst; - class MachineBasicBlock; - class MachineFunction; - class MachineInstr; - class MachineJumpTableInfo; - class MachineLoop; - class MachineRegisterInfo; - class Mangler; - class MCContext; - class MCExpr; - class MCSymbol; - template<typename T> class SmallVectorImpl; - class DataLayout; - class TargetRegisterClass; - class TargetLibraryInfo; - class TargetLoweringObjectFile; - class Value; - - namespace Sched { - enum Preference { - None, // No preference - Source, // Follow source order. - RegPressure, // Scheduling for lowest register pressure. - Hybrid, // Scheduling for both latency and register pressure. - ILP, // Scheduling for ILP in low register pressure mode. - VLIW // Scheduling for VLIW targets. - }; - } + +class BranchProbability; +class CCState; +class CCValAssign; +class FastISel; +class FunctionLoweringInfo; +class IntrinsicInst; +class MachineBasicBlock; +class MachineFunction; +class MachineInstr; +class MachineJumpTableInfo; +class MachineLoop; +class MachineRegisterInfo; +class MCContext; +class MCExpr; +class TargetRegisterClass; +class TargetLibraryInfo; +class TargetRegisterInfo; +class Value; + +namespace Sched { + + enum Preference { + None, // No preference + Source, // Follow source order. + RegPressure, // Scheduling for lowest register pressure. + Hybrid, // Scheduling for both latency and register pressure. + ILP, // Scheduling for ILP in low register pressure mode. + VLIW // Scheduling for VLIW targets. + }; + +} // end namespace Sched /// This base class for TargetLowering contains the SelectionDAG-independent /// parts that can be used from the rest of CodeGen. class TargetLoweringBase { - TargetLoweringBase(const TargetLoweringBase&) = delete; - void operator=(const TargetLoweringBase&) = delete; - public: /// This enum indicates whether operations are valid for a target, and if not, /// what action should be used to make them valid. 
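// Back to the GenericDomTree addition above: setNewRoot() covers the case
// where a transform prepends a fresh entry block that dominates the old root.
// A usage sketch (hypothetical function F, context Ctx, DominatorTree DT):
//
//   BasicBlock *NewEntry = BasicBlock::Create(Ctx, "new.entry", &F, OldEntry);
//   BranchInst::Create(OldEntry, NewEntry);  // NewEntry branches to the old root
//   DT.setNewRoot(NewEntry);                 // old root becomes its only child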
@@ -166,7 +180,9 @@ public: /// NOTE: The TargetMachine owns TLOF. explicit TargetLoweringBase(const TargetMachine &TM); - virtual ~TargetLoweringBase() {} + TargetLoweringBase(const TargetLoweringBase&) = delete; + void operator=(const TargetLoweringBase&) = delete; + virtual ~TargetLoweringBase() = default; protected: /// \brief Initialize all of the actions to default values. @@ -599,19 +615,18 @@ public: MVT &RegisterVT) const; struct IntrinsicInfo { - unsigned opc; // target opcode - EVT memVT; // memory VT - const Value* ptrVal; // value representing memory location - int offset; // offset off of ptrVal - unsigned size; // the size of the memory location - // (taken from memVT if zero) - unsigned align; // alignment - bool vol; // is volatile? - bool readMem; // reads memory? - bool writeMem; // writes memory? - - IntrinsicInfo() : opc(0), ptrVal(nullptr), offset(0), size(0), align(1), - vol(false), readMem(false), writeMem(false) {} + unsigned opc = 0; // target opcode + EVT memVT; // memory VT + const Value* ptrVal = nullptr; // value representing memory location + int offset = 0; // offset off of ptrVal + unsigned size = 0; // the size of the memory location + // (taken from memVT if zero) + unsigned align = 1; // alignment + bool vol = false; // is volatile? + bool readMem = false; // reads memory? + bool writeMem = false; // writes memory? + + IntrinsicInfo() = default; }; /// Given an intrinsic, checks if on the target the intrinsic will need to map @@ -823,7 +838,6 @@ public: getCondCodeAction(CC, VT) == Custom; } - /// If the action for this operation is to promote, this method returns the /// ValueType to promote to. MVT getTypeToPromoteTo(unsigned Op, MVT VT) const { @@ -1643,11 +1657,11 @@ public: /// If Scale is zero, there is no ScaleReg. Scale of 1 indicates a reg with /// no scale. struct AddrMode { - GlobalValue *BaseGV; - int64_t BaseOffs; - bool HasBaseReg; - int64_t Scale; - AddrMode() : BaseGV(nullptr), BaseOffs(0), HasBaseReg(false), Scale(0) {} + GlobalValue *BaseGV = nullptr; + int64_t BaseOffs = 0; + bool HasBaseReg = false; + int64_t Scale = 0; + AddrMode() = default; }; /// Return true if the addressing mode represented by AM is legal for this @@ -2093,8 +2107,6 @@ protected: private: LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const; -private: - /// Targets can specify ISD nodes that they would like PerformDAGCombine /// callbacks for by calling setTargetDAGCombine(), which sets a bit in this /// array. @@ -2192,7 +2204,6 @@ protected: /// \see enableExtLdPromotion. bool EnableExtLdPromotion; -protected: /// Return true if the value types that can be represented by the specified /// register class are all legal. bool isLegalRC(const TargetRegisterClass *RC) const; @@ -2209,12 +2220,12 @@ protected: /// This class also defines callbacks that targets must implement to lower /// target-specific constructs to SelectionDAG operators. class TargetLowering : public TargetLoweringBase { - TargetLowering(const TargetLowering&) = delete; - void operator=(const TargetLowering&) = delete; - public: struct DAGCombinerInfo; + TargetLowering(const TargetLowering&) = delete; + void operator=(const TargetLowering&) = delete; + /// NOTE: The TargetMachine owns TLOF. explicit TargetLowering(const TargetMachine &TM); @@ -2376,6 +2387,7 @@ public: void *DC; // The DAG Combiner object. 
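// The AddrMode struct above spells out the canonical addressing expression a
// target may be asked to legalize. Written out (BaseReg and ScaleReg are
// implicit registers that exist only when HasBaseReg / Scale say so):
//
//   Addr = BaseGV + BaseOffs + (HasBaseReg ? BaseReg : 0) + Scale * ScaleReg
//
// e.g. {BaseGV = &g, BaseOffs = 8, HasBaseReg = false, Scale = 4} models the
// address &g + 8 + 4 * Index.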
CombineLevel Level; bool CalledByLegalizer; + public: SelectionDAG &DAG; @@ -2542,7 +2554,7 @@ public: ArgListEntry() : isSExt(false), isZExt(false), isInReg(false), isSRet(false), isNest(false), isByVal(false), isInAlloca(false), isReturned(false), isSwiftSelf(false), isSwiftError(false), - Alignment(0) { } + Alignment(0) {} void setAttributes(ImmutableCallSite *CS, unsigned AttrIdx); }; @@ -2681,7 +2693,6 @@ public: ArgListTy &getArgs() { return Args; } - }; /// This function lowers an abstract call to a function into an actual call. @@ -3118,6 +3129,13 @@ public: EVT DataVT, SelectionDAG &DAG, bool IsCompressedMemory) const; + /// Get a pointer to vector element \p Idx located in memory for a vector of + /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of + /// bounds the returned pointer is unspecified, but will be within the vector + /// bounds. + SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, + SDValue Idx) const; + //===--------------------------------------------------------------------===// // Instruction Emitting Hooks // @@ -3169,6 +3187,6 @@ void GetReturnInfo(Type *ReturnType, AttributeSet attr, SmallVectorImpl<ISD::OutputArg> &Outs, const TargetLowering &TLI, const DataLayout &DL); -} // end llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_TARGET_TARGETLOWERING_H diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index f5493283eee6..b1d8f8f1e917 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -20,36 +20,26 @@ #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetOptions.h" -#include <cassert> #include <string> namespace llvm { -class InstrItineraryData; class GlobalValue; -class Mangler; class MachineFunctionInitializer; -class MachineModuleInfo; +class Mangler; class MCAsmInfo; class MCContext; class MCInstrInfo; class MCRegisterInfo; class MCSubtargetInfo; class MCSymbol; +class raw_pwrite_stream; class Target; -class TargetLibraryInfo; -class TargetFrameLowering; -class TargetIRAnalysis; class TargetIntrinsicInfo; -class TargetLowering; +class TargetIRAnalysis; +class TargetLoweringObjectFile; class TargetPassConfig; -class TargetRegisterInfo; class TargetSubtargetInfo; -class TargetTransformInfo; -class formatted_raw_ostream; -class raw_ostream; -class raw_pwrite_stream; -class TargetLoweringObjectFile; // The old pass manager infrastructure is hidden in a legacy namespace now. namespace legacy { @@ -64,8 +54,6 @@ using legacy::PassManagerBase; /// interface. /// class TargetMachine { - TargetMachine(const TargetMachine &) = delete; - void operator=(const TargetMachine &) = delete; protected: // Can only create subclasses. TargetMachine(const Target &T, StringRef DataLayoutString, const Triple &TargetTriple, StringRef CPU, StringRef FS, @@ -103,8 +91,11 @@ protected: // Can only create subclasses. 
unsigned O0WantsFastISel : 1; public: + const TargetOptions DefaultOptions; mutable TargetOptions Options; + TargetMachine(const TargetMachine &) = delete; + void operator=(const TargetMachine &) = delete; virtual ~TargetMachine(); const Target &getTarget() const { return TheTarget; } @@ -310,6 +301,6 @@ public: bool DisableVerify = true) override; }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_TARGET_TARGETMACHINE_H diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td index 4ddf7d77a23a..55e2c2bce3db 100644 --- a/include/llvm/Target/TargetSelectionDAG.td +++ b/include/llvm/Target/TargetSelectionDAG.td @@ -575,9 +575,6 @@ def intrinsic_w_chain : SDNode<"ISD::INTRINSIC_W_CHAIN", def intrinsic_wo_chain : SDNode<"ISD::INTRINSIC_WO_CHAIN", SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>, []>; -// Do not use cvt directly. Use cvt forms below -def cvt : SDNode<"ISD::CONVERT_RNDSAT", SDTConvertOp>; - def SDT_assertext : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 0>]>; def assertsext : SDNode<"ISD::AssertSext", SDT_assertext>; @@ -1085,54 +1082,6 @@ def atomic_load_64 : }]>; //===----------------------------------------------------------------------===// -// Selection DAG CONVERT_RNDSAT patterns - -def cvtff : PatFrag<(ops node:$val, node:$dty, node:$sty, node:$rd, node:$sat), - (cvt node:$val, node:$dty, node:$sty, node:$rd, node:$sat), [{ - return cast<CvtRndSatSDNode>(N)->getCvtCode() == ISD::CVT_FF; - }]>; - -def cvtss : PatFrag<(ops node:$val, node:$dty, node:$sty, node:$rd, node:$sat), - (cvt node:$val, node:$dty, node:$sty, node:$rd, node:$sat), [{ - return cast<CvtRndSatSDNode>(N)->getCvtCode() == ISD::CVT_SS; - }]>; - -def cvtsu : PatFrag<(ops node:$val, node:$dty, node:$sty, node:$rd, node:$sat), - (cvt node:$val, node:$dty, node:$sty, node:$rd, node:$sat), [{ - return cast<CvtRndSatSDNode>(N)->getCvtCode() == ISD::CVT_SU; - }]>; - -def cvtus : PatFrag<(ops node:$val, node:$dty, node:$sty, node:$rd, node:$sat), - (cvt node:$val, node:$dty, node:$sty, node:$rd, node:$sat), [{ - return cast<CvtRndSatSDNode>(N)->getCvtCode() == ISD::CVT_US; - }]>; - -def cvtuu : PatFrag<(ops node:$val, node:$dty, node:$sty, node:$rd, node:$sat), - (cvt node:$val, node:$dty, node:$sty, node:$rd, node:$sat), [{ - return cast<CvtRndSatSDNode>(N)->getCvtCode() == ISD::CVT_UU; - }]>; - -def cvtsf : PatFrag<(ops node:$val, node:$dty, node:$sty, node:$rd, node:$sat), - (cvt node:$val, node:$dty, node:$sty, node:$rd, node:$sat), [{ - return cast<CvtRndSatSDNode>(N)->getCvtCode() == ISD::CVT_SF; - }]>; - -def cvtuf : PatFrag<(ops node:$val, node:$dty, node:$sty, node:$rd, node:$sat), - (cvt node:$val, node:$dty, node:$sty, node:$rd, node:$sat), [{ - return cast<CvtRndSatSDNode>(N)->getCvtCode() == ISD::CVT_UF; - }]>; - -def cvtfs : PatFrag<(ops node:$val, node:$dty, node:$sty, node:$rd, node:$sat), - (cvt node:$val, node:$dty, node:$sty, node:$rd, node:$sat), [{ - return cast<CvtRndSatSDNode>(N)->getCvtCode() == ISD::CVT_FS; - }]>; - -def cvtfu : PatFrag<(ops node:$val, node:$dty, node:$sty, node:$rd, node:$sat), - (cvt node:$val, node:$dty, node:$sty, node:$rd, node:$sat), [{ - return cast<CvtRndSatSDNode>(N)->getCvtCode() == ISD::CVT_FU; - }]>; - -//===----------------------------------------------------------------------===// // Selection DAG Pattern Support. 
// // Patterns are what are actually matched against by the target-flavored diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h index bf4331383cb0..0b4351596021 100644 --- a/include/llvm/Target/TargetSubtargetInfo.h +++ b/include/llvm/Target/TargetSubtargetInfo.h @@ -14,23 +14,26 @@ #ifndef LLVM_TARGET_TARGETSUBTARGETINFO_H #define LLVM_TARGET_TARGETSUBTARGETINFO_H +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/PBQPRAConstraint.h" #include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/CodeGen.h" +#include <memory> #include <vector> namespace llvm { class CallLowering; -class DataLayout; class InstructionSelector; class LegalizerInfo; -class MachineFunction; class MachineInstr; class RegisterBankInfo; class SDep; +class SelectionDAGTargetInfo; class SUnit; class TargetFrameLowering; class TargetInstrInfo; @@ -38,9 +41,7 @@ class TargetLowering; class TargetRegisterClass; class TargetRegisterInfo; class TargetSchedModel; -class SelectionDAGTargetInfo; struct MachineSchedPolicy; -template <typename T> class SmallVectorImpl; //===----------------------------------------------------------------------===// /// @@ -49,10 +50,6 @@ template <typename T> class SmallVectorImpl; /// be exposed through a TargetSubtargetInfo-derived class. /// class TargetSubtargetInfo : public MCSubtargetInfo { - TargetSubtargetInfo(const TargetSubtargetInfo &) = delete; - void operator=(const TargetSubtargetInfo &) = delete; - TargetSubtargetInfo() = delete; - protected: // Can only create subclasses... TargetSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS, ArrayRef<SubtargetFeatureKV> PF, @@ -69,6 +66,9 @@ public: typedef enum { ANTIDEP_NONE, ANTIDEP_CRITICAL, ANTIDEP_ALL } AntiDepBreakMode; typedef SmallVectorImpl<const TargetRegisterClass *> RegClassVector; + TargetSubtargetInfo() = delete; + TargetSubtargetInfo(const TargetSubtargetInfo &) = delete; + void operator=(const TargetSubtargetInfo &) = delete; virtual ~TargetSubtargetInfo(); virtual bool isXRaySupported() const { return false; } @@ -229,6 +229,6 @@ public: virtual bool enableSubRegLiveness() const { return false; } }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_TARGET_TARGETSUBTARGETINFO_H diff --git a/include/llvm/Transforms/Scalar/IVUsersPrinter.h b/include/llvm/Transforms/Scalar/IVUsersPrinter.h new file mode 100644 index 000000000000..fad00d86a95f --- /dev/null +++ b/include/llvm/Transforms/Scalar/IVUsersPrinter.h @@ -0,0 +1,30 @@ +//===- IVUsersPrinter.h - Induction Variable Users Printing -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_IVUSERSPRINTER_H +#define LLVM_TRANSFORMS_SCALAR_IVUSERSPRINTER_H + +#include "llvm/Analysis/IVUsers.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +/// Printer pass for the \c IVUsers for a loop. 
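// The run() signature used by IVUsersPrinterPass below is the uniform
// loop-pass interface this patch introduces. A skeleton pass written against
// it (hypothetical no-op, for illustration only):
//
//   struct MyLoopPass : PassInfoMixin<MyLoopPass> {
//     PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
//                           LoopStandardAnalysisResults &AR, LPMUpdater &U) {
//       return PreservedAnalyses::all();  // touches nothing, preserves everything
//     }
//   };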
+class IVUsersPrinterPass : public PassInfoMixin<IVUsersPrinterPass> { + raw_ostream &OS; + +public: + explicit IVUsersPrinterPass(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; +} + +#endif diff --git a/include/llvm/Transforms/Scalar/IndVarSimplify.h b/include/llvm/Transforms/Scalar/IndVarSimplify.h index 24a31594b153..4a4683f1a07d 100644 --- a/include/llvm/Transforms/Scalar/IndVarSimplify.h +++ b/include/llvm/Transforms/Scalar/IndVarSimplify.h @@ -16,14 +16,15 @@ #define LLVM_TRANSFORMS_SCALAR_INDVARSIMPLIFY_H #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { class IndVarSimplifyPass : public PassInfoMixin<IndVarSimplifyPass> { public: - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM); + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); }; } diff --git a/include/llvm/Transforms/Scalar/LICM.h b/include/llvm/Transforms/Scalar/LICM.h index 39bbc72f8cb4..68ad190c7647 100644 --- a/include/llvm/Transforms/Scalar/LICM.h +++ b/include/llvm/Transforms/Scalar/LICM.h @@ -34,15 +34,16 @@ #define LLVM_TRANSFORMS_SCALAR_LICM_H #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { /// Performs Loop Invariant Code Motion Pass. class LICMPass : public PassInfoMixin<LICMPass> { public: - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM); + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); }; } // end namespace llvm diff --git a/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h b/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h new file mode 100644 index 000000000000..5eddd5fdc7e7 --- /dev/null +++ b/include/llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h @@ -0,0 +1,31 @@ +//===- llvm/Analysis/LoopAccessAnalysisPrinter.h ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_LOOPACCESSANALYSISPRINTER_H +#define LLVM_TRANSFORMS_SCALAR_LOOPACCESSANALYSISPRINTER_H + +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +/// \brief Printer pass for the \c LoopAccessInfo results. 
+class LoopAccessInfoPrinterPass + : public PassInfoMixin<LoopAccessInfoPrinterPass> { + raw_ostream &OS; + +public: + explicit LoopAccessInfoPrinterPass(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +} // End llvm namespace + +#endif diff --git a/include/llvm/Transforms/Scalar/LoopDeletion.h b/include/llvm/Transforms/Scalar/LoopDeletion.h index 891f08faa48a..b44f823a82ca 100644 --- a/include/llvm/Transforms/Scalar/LoopDeletion.h +++ b/include/llvm/Transforms/Scalar/LoopDeletion.h @@ -15,16 +15,17 @@ #define LLVM_TRANSFORMS_SCALAR_LOOPDELETION_H #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { class LoopDeletionPass : public PassInfoMixin<LoopDeletionPass> { public: LoopDeletionPass() {} - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM); + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); bool runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE, LoopInfo &loopInfo); diff --git a/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h b/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h index 0c052ddd2fe7..40349e8f7fe0 100644 --- a/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h +++ b/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h @@ -17,15 +17,16 @@ #define LLVM_TRANSFORMS_SCALAR_LOOPIDIOMRECOGNIZE_H #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { /// Performs Loop Idiom Recognize Pass. class LoopIdiomRecognizePass : public PassInfoMixin<LoopIdiomRecognizePass> { public: - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM); + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); }; } // end namespace llvm diff --git a/include/llvm/Transforms/Scalar/LoopInstSimplify.h b/include/llvm/Transforms/Scalar/LoopInstSimplify.h index e30f4a97b78e..bb8bc29577a2 100644 --- a/include/llvm/Transforms/Scalar/LoopInstSimplify.h +++ b/include/llvm/Transforms/Scalar/LoopInstSimplify.h @@ -15,15 +15,16 @@ #define LLVM_TRANSFORMS_SCALAR_LOOPINSTSIMPLIFY_H #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { /// Performs Loop Inst Simplify Pass. class LoopInstSimplifyPass : public PassInfoMixin<LoopInstSimplifyPass> { public: - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM); + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); }; } // end namespace llvm diff --git a/include/llvm/Transforms/Scalar/LoopPassManager.h b/include/llvm/Transforms/Scalar/LoopPassManager.h new file mode 100644 index 000000000000..b0e6dd6f4c08 --- /dev/null +++ b/include/llvm/Transforms/Scalar/LoopPassManager.h @@ -0,0 +1,363 @@ +//===- LoopPassManager.h - Loop pass management -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This header provides classes for managing a pipeline of passes over loops +/// in LLVM IR. +/// +/// The primary loop pass pipeline is managed in a very particular way to +/// provide a set of core guarantees: +/// 1) Loops are, where possible, in simplified form. +/// 2) Loops are *always* in LCSSA form. +/// 3) A collection of Loop-specific analysis results are available: +/// - LoopInfo +/// - DominatorTree +/// - ScalarEvolution +/// - AAManager +/// 4) All loop passes preserve #1 (where possible), #2, and #3. +/// 5) Loop passes run over each loop in the loop nest from the innermost to +/// the outermost. Specifically, all inner loops are processed before +/// passes run over outer loops. When running the pipeline across an inner +/// loop creates new inner loops, those are added and processed in this +/// order as well. +/// +/// This process is designed to facilitate transformations which simplify, +/// reduce, and remove loops. For passes which are more oriented towards +/// optimizing loops, especially optimizing loop *nests* instead of single +/// loops in isolation, this framework is less interesting. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_LOOPPASSMANAGER_H +#define LLVM_TRANSFORMS_SCALAR_LOOPPASSMANAGER_H + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/PriorityWorklist.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +// Forward declarations of an update tracking API used in the pass manager. +class LPMUpdater; + +// Explicit specialization and instantiation declarations for the pass manager. +// See the comments on the definition of the specialization for details on how +// it differs from the primary template. +template <> +PreservedAnalyses +PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &, + LPMUpdater &>::run(Loop &InitialL, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AnalysisResults, + LPMUpdater &U); +extern template class PassManager<Loop, LoopAnalysisManager, + LoopStandardAnalysisResults &, LPMUpdater &>; + +/// \brief The Loop pass manager. +/// +/// See the documentation for the PassManager template for details. It runs +/// a sequence of Loop passes over each Loop that the manager is run over. This +/// typedef serves as a convenient way to refer to this construct. +typedef PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &, + LPMUpdater &> + LoopPassManager; + +/// A partial specialization of the require analysis template pass to forward +/// the extra parameters from a transformation's run method to the +/// AnalysisManager's getResult. 
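// Putting this header's pieces together, a function-level pipeline that runs
// loop passes over every loop in postorder would be assembled roughly like
// this (sketch; assumes the loop passes updated elsewhere in this patch):
//
//   LoopPassManager LPM;
//   LPM.addPass(LoopInstSimplifyPass());
//   LPM.addPass(LICMPass());
//   FunctionPassManager FPM;
//   FPM.addPass(FunctionToLoopPassAdaptor<LoopPassManager>(std::move(LPM)));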
+template <typename AnalysisT> +struct RequireAnalysisPass<AnalysisT, Loop, LoopAnalysisManager, + LoopStandardAnalysisResults &, LPMUpdater &> + : PassInfoMixin< + RequireAnalysisPass<AnalysisT, Loop, LoopAnalysisManager, + LoopStandardAnalysisResults &, LPMUpdater &>> { + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &) { + (void)AM.template getResult<AnalysisT>(L, AR); + return PreservedAnalyses::all(); + } +}; + +/// An alias template to easily name a require analysis loop pass. +template <typename AnalysisT> +using RequireAnalysisLoopPass = + RequireAnalysisPass<AnalysisT, Loop, LoopAnalysisManager, + LoopStandardAnalysisResults &, LPMUpdater &>; + +namespace internal { +/// Helper to implement appending of loops onto a worklist. +/// +/// We want to process loops in postorder, but the worklist is a LIFO data +/// structure, so we append to it in *reverse* postorder. +/// +/// For trees, a preorder traversal is a viable reverse postorder, so we +/// actually append using a preorder walk algorithm. +template <typename RangeT> +inline void appendLoopsToWorklist(RangeT &&Loops, + SmallPriorityWorklist<Loop *, 4> &Worklist) { + // We use an internal worklist to build up the preorder traversal without + // recursion. + SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist; + + // We walk the initial sequence of loops in reverse because we generally want + // to visit defs before uses and the worklist is LIFO. + for (Loop *RootL : reverse(Loops)) { + assert(PreOrderLoops.empty() && "Must start with an empty preorder walk."); + assert(PreOrderWorklist.empty() && + "Must start with an empty preorder walk worklist."); + PreOrderWorklist.push_back(RootL); + do { + Loop *L = PreOrderWorklist.pop_back_val(); + PreOrderWorklist.append(L->begin(), L->end()); + PreOrderLoops.push_back(L); + } while (!PreOrderWorklist.empty()); + + Worklist.insert(std::move(PreOrderLoops)); + PreOrderLoops.clear(); + } +} +} + +template <typename LoopPassT> class FunctionToLoopPassAdaptor; + +/// This class provides an interface for updating the loop pass manager based +/// on mutations to the loop nest. +/// +/// A reference to an instance of this class is passed as an argument to each +/// Loop pass, and Loop passes should use it to update LPM infrastructure if +/// they modify the loop nest structure. +class LPMUpdater { +public: + /// This can be queried by loop passes which run other loop passes (like pass + /// managers) to know whether the loop needs to be skipped due to updates to + /// the loop nest. + /// + /// If this returns true, the loop object may have been deleted, so passes + /// should take care not to touch the object. + bool skipCurrentLoop() const { return SkipCurrentLoop; } + + /// Loop passes should use this method to indicate they have deleted a loop + /// from the nest. + /// + /// Note that this loop must either be the current loop or a subloop of the + /// current loop. This routine must be called prior to removing the loop from + /// the loop nest. + /// + /// If this is called for the current loop, in addition to clearing any + /// state, this routine will mark that the current loop should be skipped by + /// the rest of the pass management infrastructure. 
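+  ///
+  /// A sketch of typical use (illustrative only; \c isLoopDead is a
+  /// hypothetical helper, and a real pass must also update LoopInfo,
+  /// ScalarEvolution, etc. when it actually erases the loop):
+  /// \code
+  ///   PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+  ///                         LoopStandardAnalysisResults &AR, LPMUpdater &U) {
+  ///     if (!isLoopDead(L))
+  ///       return PreservedAnalyses::all();
+  ///     U.markLoopAsDeleted(L); // Before unlinking L from the loop nest.
+  ///     // ... erase L from AR.LI and delete its blocks ...
+  ///     return getLoopPassPreservedAnalyses();
+  ///   }
+  /// \endcode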
+ void markLoopAsDeleted(Loop &L) { + LAM.clear(L); + assert(CurrentL->contains(&L) && "Cannot delete a loop outside of the " + "subloop tree currently being processed."); + if (&L == CurrentL) + SkipCurrentLoop = true; + } + + /// Loop passes should use this method to indicate they have added new child + /// loops of the current loop. + /// + /// \p NewChildLoops must contain only the immediate children. Any nested + /// loops within them will be visited in postorder as usual for the loop pass + /// manager. + void addChildLoops(ArrayRef<Loop *> NewChildLoops) { + // Insert ourselves back into the worklist first, as this loop should be + // revisited after all the children have been processed. + Worklist.insert(CurrentL); + +#ifndef NDEBUG + for (Loop *NewL : NewChildLoops) + assert(NewL->getParentLoop() == CurrentL && "All of the new loops must " + "be immediate children of " + "the current loop!"); +#endif + + internal::appendLoopsToWorklist(NewChildLoops, Worklist); + + // Also skip further processing of the current loop--it will be revisited + // after all of its newly added children are accounted for. + SkipCurrentLoop = true; + } + + /// Loop passes should use this method to indicate they have added new + /// sibling loops to the current loop. + /// + /// \p NewSibLoops must only contain the immediate sibling loops. Any nested + /// loops within them will be visited in postorder as usual for the loop pass + /// manager. + void addSiblingLoops(ArrayRef<Loop *> NewSibLoops) { +#ifndef NDEBUG + for (Loop *NewL : NewSibLoops) + assert(NewL->getParentLoop() == ParentL && + "All of the new loops must be siblings of the current loop!"); +#endif + + internal::appendLoopsToWorklist(NewSibLoops, Worklist); + + // No need to skip the current loop or revisit it, as sibling loops + // shouldn't impact anything. + } + +private: + template <typename LoopPassT> friend class llvm::FunctionToLoopPassAdaptor; + + /// The \c FunctionToLoopPassAdaptor's worklist of loops to process. + SmallPriorityWorklist<Loop *, 4> &Worklist; + + /// The analysis manager for use in the current loop nest. + LoopAnalysisManager &LAM; + + Loop *CurrentL; + bool SkipCurrentLoop; + +#ifndef NDEBUG + // In debug builds we also track the parent loop to implement asserts even in + // the face of loop deletion. + Loop *ParentL; +#endif + + LPMUpdater(SmallPriorityWorklist<Loop *, 4> &Worklist, + LoopAnalysisManager &LAM) + : Worklist(Worklist), LAM(LAM) {} +}; + +/// \brief Adaptor that maps from a function to its loops. +/// +/// Designed to allow composition of a LoopPass(Manager) and a +/// FunctionPassManager. Note that if this pass is constructed with a \c +/// FunctionAnalysisManager it will run the \c LoopAnalysisManagerFunctionProxy +/// analysis prior to running the loop passes over the function to enable a \c +/// LoopAnalysisManager to be used within this run safely. +template <typename LoopPassT> +class FunctionToLoopPassAdaptor + : public PassInfoMixin<FunctionToLoopPassAdaptor<LoopPassT>> { +public: + explicit FunctionToLoopPassAdaptor(LoopPassT Pass) : Pass(std::move(Pass)) {} + + /// \brief Runs the loop passes across every loop in the function. + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) { + // Setup the loop analysis manager from its proxy. + LoopAnalysisManager &LAM = + AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); + // Get the loop structure for this function + LoopInfo &LI = AM.getResult<LoopAnalysis>(F); + + // If there are no loops, there is nothing to do here. 
+ if (LI.empty()) + return PreservedAnalyses::all(); + + // Get the analysis results needed by loop passes. + LoopStandardAnalysisResults LAR = {AM.getResult<AAManager>(F), + AM.getResult<AssumptionAnalysis>(F), + AM.getResult<DominatorTreeAnalysis>(F), + AM.getResult<LoopAnalysis>(F), + AM.getResult<ScalarEvolutionAnalysis>(F), + AM.getResult<TargetLibraryAnalysis>(F), + AM.getResult<TargetIRAnalysis>(F)}; + + PreservedAnalyses PA = PreservedAnalyses::all(); + + // A postorder worklist of loops to process. + SmallPriorityWorklist<Loop *, 4> Worklist; + + // Register the worklist and loop analysis manager so that loop passes can + // update them when they mutate the loop nest structure. + LPMUpdater Updater(Worklist, LAM); + + // Add the loop nests in the reverse order of LoopInfo. For some reason, + // they are stored in RPO w.r.t. the control flow graph in LoopInfo. For + // the purpose of unrolling, loop deletion, and LICM, we largely want to + // work forward across the CFG so that we visit defs before uses and can + // propagate simplifications from one loop nest into the next. + // FIXME: Consider changing the order in LoopInfo. + internal::appendLoopsToWorklist(reverse(LI), Worklist); + + do { + Loop *L = Worklist.pop_back_val(); + + // Reset the update structure for this loop. + Updater.CurrentL = L; + Updater.SkipCurrentLoop = false; +#ifndef NDEBUG + Updater.ParentL = L->getParentLoop(); +#endif + + PreservedAnalyses PassPA = Pass.run(*L, LAM, LAR, Updater); + // FIXME: We should verify the set of analyses relevant to Loop passes + // are preserved. + + // If the loop hasn't been deleted, we need to handle invalidation here. + if (!Updater.skipCurrentLoop()) + // We know that the loop pass couldn't have invalidated any other + // loop's analyses (that's the contract of a loop pass), so directly + // handle the loop analysis manager's invalidation here. + LAM.invalidate(*L, PassPA); + + // Then intersect the preserved set so that invalidation of module + // analyses will eventually occur when the module pass completes. + PA.intersect(std::move(PassPA)); + } while (!Worklist.empty()); + + // By definition we preserve the proxy. We also preserve all analyses on + // Loops. This precludes *any* invalidation of loop analyses by the proxy, + // but that's OK because we've taken care to invalidate analyses in the + // loop analysis manager incrementally above. + PA.preserveSet<AllAnalysesOn<Loop>>(); + PA.preserve<LoopAnalysisManagerFunctionProxy>(); + // We also preserve the set of standard analyses. + PA.preserve<AssumptionAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + PA.preserve<ScalarEvolutionAnalysis>(); + // FIXME: What we really want to do here is preserve an AA category, but + // that concept doesn't exist yet. + PA.preserve<AAManager>(); + PA.preserve<BasicAA>(); + PA.preserve<GlobalsAA>(); + PA.preserve<SCEVAA>(); + return PA; + } + +private: + LoopPassT Pass; +}; + +/// \brief A function to deduce a loop pass type and wrap it in the templated +/// adaptor. +template <typename LoopPassT> +FunctionToLoopPassAdaptor<LoopPassT> +createFunctionToLoopPassAdaptor(LoopPassT Pass) { + return FunctionToLoopPassAdaptor<LoopPassT>(std::move(Pass)); +} + +/// \brief Pass for printing a loop's contents as textual IR. 
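+///
+/// A minimal usage sketch (assuming \c FPM is an existing
+/// FunctionPassManager):
+/// \code
+///   FPM.addPass(createFunctionToLoopPassAdaptor(
+///       PrintLoopPass(dbgs(), "; Loop before transformation:\n")));
+/// \endcode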
+class PrintLoopPass : public PassInfoMixin<PrintLoopPass> { + raw_ostream &OS; + std::string Banner; + +public: + PrintLoopPass(); + PrintLoopPass(raw_ostream &OS, const std::string &Banner = ""); + + PreservedAnalyses run(Loop &L, LoopAnalysisManager &, + LoopStandardAnalysisResults &, LPMUpdater &); +}; +} + +#endif // LLVM_TRANSFORMS_SCALAR_LOOPPASSMANAGER_H diff --git a/include/llvm/Transforms/Scalar/LoopRotation.h b/include/llvm/Transforms/Scalar/LoopRotation.h index 54b8ec545ed2..ea8d5618e6f7 100644 --- a/include/llvm/Transforms/Scalar/LoopRotation.h +++ b/include/llvm/Transforms/Scalar/LoopRotation.h @@ -15,8 +15,8 @@ #define LLVM_TRANSFORMS_SCALAR_LOOPROTATION_H #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { @@ -24,7 +24,8 @@ namespace llvm { class LoopRotatePass : public PassInfoMixin<LoopRotatePass> { public: LoopRotatePass(bool EnableHeaderDuplication = true); - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM); + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); private: const bool EnableHeaderDuplication; diff --git a/include/llvm/Transforms/Scalar/LoopSimplifyCFG.h b/include/llvm/Transforms/Scalar/LoopSimplifyCFG.h index 2f06782052c5..7628c7413eac 100644 --- a/include/llvm/Transforms/Scalar/LoopSimplifyCFG.h +++ b/include/llvm/Transforms/Scalar/LoopSimplifyCFG.h @@ -18,15 +18,16 @@ #define LLVM_TRANSFORMS_SCALAR_LOOPSIMPLIFYCFG_H #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { /// Performs basic CFG simplifications to assist other loop passes. class LoopSimplifyCFGPass : public PassInfoMixin<LoopSimplifyCFGPass> { public: - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM); + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); }; } // end namespace llvm diff --git a/include/llvm/Transforms/Scalar/LoopStrengthReduce.h b/include/llvm/Transforms/Scalar/LoopStrengthReduce.h index 11c0d9bce85b..ebcb32125262 100644 --- a/include/llvm/Transforms/Scalar/LoopStrengthReduce.h +++ b/include/llvm/Transforms/Scalar/LoopStrengthReduce.h @@ -23,15 +23,16 @@ #define LLVM_TRANSFORMS_SCALAR_LOOPSTRENGTHREDUCE_H #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { /// Performs Loop Strength Reduce Pass. 
class LoopStrengthReducePass : public PassInfoMixin<LoopStrengthReducePass> {
public:
-  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM);
+  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
};

} // end namespace llvm
diff --git a/include/llvm/Transforms/Scalar/LoopUnrollPass.h b/include/llvm/Transforms/Scalar/LoopUnrollPass.h
index 74a7258df5fc..9da95ef81fad 100644
--- a/include/llvm/Transforms/Scalar/LoopUnrollPass.h
+++ b/include/llvm/Transforms/Scalar/LoopUnrollPass.h
@@ -11,8 +11,8 @@
 #define LLVM_TRANSFORMS_SCALAR_LOOPUNROLLPASS_H
 
 #include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 
 namespace llvm {
 
@@ -23,7 +23,8 @@ struct LoopUnrollPass : public PassInfoMixin<LoopUnrollPass> {
   Optional<bool> ProvidedRuntime;
   Optional<bool> ProvidedUpperBound;
 
-  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM);
+  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
 };
 
 } // end namespace llvm
diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index 845069d4260a..27b45c4fa941 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -29,6 +29,7 @@ class DataLayout;
 class DominatorTree;
 class Loop;
 class LoopInfo;
+class OptimizationRemarkEmitter;
 class Pass;
 class PredicatedScalarEvolution;
 class PredIteratorCache;
@@ -404,11 +405,11 @@ bool formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
 /// uses before definitions, allowing us to sink a loop body in one pass without
 /// iteration. Takes DomTreeNode, AliasAnalysis, LoopInfo, DominatorTree,
 /// DataLayout, TargetLibraryInfo, Loop, AliasSet information for all
-/// instructions of the loop and loop safety information as arguments.
-/// It returns changed status.
+/// instructions of the loop and loop safety information as
+/// arguments. Diagnostics are emitted via \p ORE. It returns changed status.
 bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
                 TargetLibraryInfo *, Loop *, AliasSetTracker *,
-                LoopSafetyInfo *);
+                LoopSafetyInfo *, OptimizationRemarkEmitter *ORE);
 
 /// \brief Walk the specified region of the CFG (defined by all blocks
 /// dominated by the specified block, and that are in the current loop) in depth
@@ -416,10 +417,11 @@ bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
 /// before uses, allowing us to hoist a loop body in one pass without iteration.
 /// Takes DomTreeNode, AliasAnalysis, LoopInfo, DominatorTree, DataLayout,
 /// TargetLibraryInfo, Loop, AliasSet information for all instructions of the
-/// loop and loop safety information as arguments. It returns changed status.
+/// loop and loop safety information as arguments. Diagnostics are emitted
+/// via \p ORE. It returns changed status.
 bool hoistRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
                  TargetLibraryInfo *, Loop *, AliasSetTracker *,
-                 LoopSafetyInfo *);
+                 LoopSafetyInfo *, OptimizationRemarkEmitter *ORE);
 
 /// \brief Try to promote memory values to scalars by sinking stores out of
 /// the loop and moving loads to before the loop. We do this by looping over
@@ -427,12 +429,14 @@ bool hoistRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
 /// loop invariant. It takes AliasSet, Loop exit blocks vector, loop exit blocks
 /// insertion point vector, PredIteratorCache, LoopInfo, DominatorTree, Loop,
 /// AliasSet information for all instructions of the loop and loop safety
-/// information as arguments. It returns changed status.
+/// information as arguments. Diagnostics are emitted via \p ORE. It returns
+/// changed status.
 bool promoteLoopAccessesToScalars(AliasSet &, SmallVectorImpl<BasicBlock *> &,
                                   SmallVectorImpl<Instruction *> &,
                                   PredIteratorCache &, LoopInfo *,
                                   DominatorTree *, const TargetLibraryInfo *,
-                                  Loop *, AliasSetTracker *, LoopSafetyInfo *);
+                                  Loop *, AliasSetTracker *, LoopSafetyInfo *,
+                                  OptimizationRemarkEmitter *);
 
 /// \brief Computes safety information for a loop
 /// checks loop body & header for the possibility of may throw
@@ -478,11 +482,12 @@ void getLoopAnalysisUsage(AnalysisUsage &AU);
 /// preheader to loop body (no speculation).
 /// If SafetyInfo is not null, we are checking for hoisting/sinking
 /// instructions from loop body to preheader/exit. Check if the instruction
-/// can execute specultatively.
-///
+/// can execute speculatively.
+/// If \p ORE is set, use it to emit optimization remarks.
 bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                         Loop *CurLoop, AliasSetTracker *CurAST,
-                        LoopSafetyInfo *SafetyInfo);
+                        LoopSafetyInfo *SafetyInfo,
+                        OptimizationRemarkEmitter *ORE = nullptr);
 }
 
 #endif
diff --git a/include/llvm/Transforms/Utils/UnrollLoop.h b/include/llvm/Transforms/Utils/UnrollLoop.h
index 2ea28f2d4e13..f322bea7aa2e 100644
--- a/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -33,6 +33,12 @@ class Pass;
 class OptimizationRemarkEmitter;
 class ScalarEvolution;
 
+typedef SmallDenseMap<const Loop *, Loop *, 4> NewLoopsMap;
+
+const Loop* addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
+                                     BasicBlock *ClonedBB, LoopInfo *LI,
+                                     NewLoopsMap &NewLoops);
+
 bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
                 bool AllowRuntime, bool AllowExpensiveTripCount,
                 bool PreserveCondBr, bool PreserveOnlyFirst,
diff --git a/include/llvm/Transforms/Vectorize/LoopVectorize.h b/include/llvm/Transforms/Vectorize/LoopVectorize.h
index 2efc7ca4f8a1..73d1f264c37b 100644
--- a/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -57,12 +57,12 @@
 #include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include <functional>
 
 namespace llvm {
diff --git a/include/llvm/XRay/Trace.h b/include/llvm/XRay/Trace.h
new file mode 100644
index 000000000000..6b033d686b06
--- /dev/null
+++ b/include/llvm/XRay/Trace.h
@@ -0,0 +1,71 @@
+//===- Trace.h - XRay Trace Abstraction -----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the XRay Trace class representing records in an XRay trace file.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_XRAY_TRACE_H
+#define LLVM_XRAY_TRACE_H
+
+#include <cstdint>
+#include <vector>
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/XRay/XRayRecord.h"
+
+namespace llvm {
+namespace xray {
+
+/// A Trace object represents the records that have been loaded from XRay
+/// log files generated by instrumented binaries. We encapsulate the logic of
+/// reading the traces in factory functions that populate the Trace object
+/// appropriately.
+///
+/// Trace objects provide an accessor to an XRayFileHeader which provides more
+/// details about the file from which the XRay trace was loaded.
+///
+/// Usage:
+///
+///   if (auto TraceOrErr = loadTraceFile("xray-log.something.xray")) {
+///     auto& T = *TraceOrErr;
+///     // T.getFileHeader() will provide information from the trace header.
+///     for (const XRayRecord &R : T) {
+///       // ... do something with R here.
+///     }
+///   } else {
+///     // Handle the error here.
+///   }
+///
+class Trace {
+  XRayFileHeader FileHeader;
+  std::vector<XRayRecord> Records;
+
+  typedef std::vector<XRayRecord>::const_iterator citerator;
+
+  friend Expected<Trace> loadTraceFile(StringRef, bool);
+
+public:
+  /// Provides access to the loaded XRay trace file header.
+  const XRayFileHeader &getFileHeader() const { return FileHeader; }
+
+  citerator begin() const { return Records.begin(); }
+  citerator end() const { return Records.end(); }
+  size_t size() const { return Records.size(); }
+};
+
+/// This function will attempt to load XRay trace records from the provided
+/// |Filename|.
+Expected<Trace> loadTraceFile(StringRef Filename, bool Sort = false);
+
+} // namespace xray
+} // namespace llvm
+
+#endif // LLVM_XRAY_TRACE_H
diff --git a/include/llvm/XRay/XRayRecord.h b/include/llvm/XRay/XRayRecord.h
new file mode 100644
index 000000000000..a96846136ec3
--- /dev/null
+++ b/include/llvm/XRay/XRayRecord.h
@@ -0,0 +1,76 @@
+//===- XRayRecord.h - XRay Trace Record -----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file replicates the record definition for XRay log entries. This should
+// follow the evolution of the log record versions supported in the compiler-rt
+// xray project.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_XRAY_XRAY_RECORD_H
+#define LLVM_XRAY_XRAY_RECORD_H
+
+#include <cstdint>
+
+namespace llvm {
+namespace xray {
+
+/// XRay traces all have a header providing some top-matter information useful
+/// to help tools determine how to interpret the information available in the
+/// trace.
+struct XRayFileHeader {
+  /// Version of the XRay implementation that produced this file.
+  uint16_t Version = 0;
+
+  /// A numeric identifier for the type of file this is. Best used in
+  /// combination with Version.
+  uint16_t Type = 0;
+
+  /// Whether the timestamp counters (TSC) produced by the CPU move at a
+  /// constant rate.
+  bool ConstantTSC;
+
+  /// Whether the timestamp counters (TSC) produced by the CPU do not stop.
+  bool NonstopTSC;
+
+  /// The number of cycles per second for the CPU that produced the timestamp
+  /// counter (TSC) values. Useful for estimating the amount of time that
+  /// elapsed between two TSCs on some platforms.
+  uint64_t CycleFrequency = 0;
+};
+
+/// Enumerates the supported types of records that could be seen in XRay
+/// traces. This may or may not correspond to actual record types in the raw
+/// trace (as the loader implementation may synthesize this information in
+/// the process of loading).
+enum class RecordTypes { ENTER, EXIT };
+
+struct XRayRecord {
+  /// The type of record.
+  uint16_t RecordType;
+
+  /// The CPU where the thread is running. We assume number of CPUs <= 256.
+  uint8_t CPU;
+
+  /// Identifies the type of record.
+  RecordTypes Type;
+
+  /// The function ID for the record.
+  int32_t FuncId;
+
+  /// The full 8 bytes of the TSC when we get the log record.
+  uint64_t TSC;
+
+  /// The thread ID for the currently running thread.
+  uint32_t TId;
+};
+
+} // namespace xray
+} // namespace llvm
+
+#endif // LLVM_XRAY_XRAY_RECORD_H
diff --git a/include/llvm/XRay/YAMLXRayRecord.h b/include/llvm/XRay/YAMLXRayRecord.h
new file mode 100644
index 000000000000..f5836b392242
--- /dev/null
+++ b/include/llvm/XRay/YAMLXRayRecord.h
@@ -0,0 +1,99 @@
+//===- YAMLXRayRecord.h - XRay Record YAML Support Definitions ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Types and traits specialisations for YAML I/O of XRay log entries.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_XRAY_YAML_XRAY_RECORD_H
+#define LLVM_XRAY_YAML_XRAY_RECORD_H
+
+#include <type_traits>
+
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/XRay/XRayRecord.h"
+
+namespace llvm {
+namespace xray {
+
+struct YAMLXRayFileHeader {
+  uint16_t Version;
+  uint16_t Type;
+  bool ConstantTSC;
+  bool NonstopTSC;
+  uint64_t CycleFrequency;
+};
+
+struct YAMLXRayRecord {
+  uint16_t RecordType;
+  uint8_t CPU;
+  RecordTypes Type;
+  int32_t FuncId;
+  std::string Function;
+  uint64_t TSC;
+  uint32_t TId;
+};
+
+struct YAMLXRayTrace {
+  YAMLXRayFileHeader Header;
+  std::vector<YAMLXRayRecord> Records;
+};
+
+} // namespace xray
+
+namespace yaml {
+
+// YAML Traits
+// -----------
+template <> struct ScalarEnumerationTraits<xray::RecordTypes> {
+  static void enumeration(IO &IO, xray::RecordTypes &Type) {
+    IO.enumCase(Type, "function-enter", xray::RecordTypes::ENTER);
+    IO.enumCase(Type, "function-exit", xray::RecordTypes::EXIT);
+  }
+};
+
+template <> struct MappingTraits<xray::YAMLXRayFileHeader> {
+  static void mapping(IO &IO, xray::YAMLXRayFileHeader &Header) {
+    IO.mapRequired("version", Header.Version);
+    IO.mapRequired("type", Header.Type);
+    IO.mapRequired("constant-tsc", Header.ConstantTSC);
+    IO.mapRequired("nonstop-tsc", Header.NonstopTSC);
+    IO.mapRequired("cycle-frequency", Header.CycleFrequency);
+  }
+};
+
+template <> struct MappingTraits<xray::YAMLXRayRecord> {
+  static void mapping(IO &IO, xray::YAMLXRayRecord &Record) {
+    // FIXME: Make this type actually be descriptive
+    IO.mapRequired("type", Record.RecordType);
+    IO.mapRequired("func-id", Record.FuncId);
+    IO.mapOptional("function", Record.Function);
+    IO.mapRequired("cpu", Record.CPU);
+    IO.mapRequired("thread", Record.TId);
+    IO.mapRequired("kind", Record.Type);
+    IO.mapRequired("tsc", Record.TSC);
+  }
+
+  static constexpr bool flow = true;
+};
+
+template <> struct
MappingTraits<xray::YAMLXRayTrace> { + static void mapping(IO &IO, xray::YAMLXRayTrace &Trace) { + // A trace file contains two parts, the header and the list of all the + // trace records. + IO.mapRequired("header", Trace.Header); + IO.mapRequired("records", Trace.Records); + } +}; + +} // namespace yaml +} // namespace llvm + +LLVM_YAML_IS_SEQUENCE_VECTOR(xray::YAMLXRayRecord) + +#endif // LLVM_XRAY_YAML_XRAY_RECORD_H diff --git a/lib/Analysis/AssumptionCache.cpp b/lib/Analysis/AssumptionCache.cpp index 3c518034ba62..aa55d79b761e 100644 --- a/lib/Analysis/AssumptionCache.cpp +++ b/lib/Analysis/AssumptionCache.cpp @@ -24,6 +24,109 @@ using namespace llvm; using namespace llvm::PatternMatch; +SmallVector<WeakVH, 1> &AssumptionCache::getAffectedValues(Value *V) { + // Try using find_as first to avoid creating extra value handles just for the + // purpose of doing the lookup. + auto AVI = AffectedValues.find_as(V); + if (AVI != AffectedValues.end()) + return AVI->second; + + auto AVIP = AffectedValues.insert({ + AffectedValueCallbackVH(V, this), SmallVector<WeakVH, 1>()}); + return AVIP.first->second; +} + +void AssumptionCache::updateAffectedValues(CallInst *CI) { + // Note: This code must be kept in-sync with the code in + // computeKnownBitsFromAssume in ValueTracking. + + SmallVector<Value *, 16> Affected; + auto AddAffected = [&Affected](Value *V) { + if (isa<Argument>(V)) { + Affected.push_back(V); + } else if (auto *I = dyn_cast<Instruction>(V)) { + Affected.push_back(I); + + if (I->getOpcode() == Instruction::BitCast || + I->getOpcode() == Instruction::PtrToInt) { + auto *Op = I->getOperand(0); + if (isa<Instruction>(Op) || isa<Argument>(Op)) + Affected.push_back(Op); + } + } + }; + + Value *Cond = CI->getArgOperand(0), *A, *B; + AddAffected(Cond); + + CmpInst::Predicate Pred; + if (match(Cond, m_ICmp(Pred, m_Value(A), m_Value(B)))) { + AddAffected(A); + AddAffected(B); + + if (Pred == ICmpInst::ICMP_EQ) { + // For equality comparisons, we handle the case of bit inversion. + auto AddAffectedFromEq = [&AddAffected](Value *V) { + Value *A; + if (match(V, m_Not(m_Value(A)))) { + AddAffected(A); + V = A; + } + + Value *B; + ConstantInt *C; + // (A & B) or (A | B) or (A ^ B). + if (match(V, + m_CombineOr(m_And(m_Value(A), m_Value(B)), + m_CombineOr(m_Or(m_Value(A), m_Value(B)), + m_Xor(m_Value(A), m_Value(B)))))) { + AddAffected(A); + AddAffected(B); + // (A << C) or (A >>_s C) or (A >>_u C) where C is some constant. + } else if (match(V, + m_CombineOr(m_Shl(m_Value(A), m_ConstantInt(C)), + m_CombineOr(m_LShr(m_Value(A), m_ConstantInt(C)), + m_AShr(m_Value(A), + m_ConstantInt(C)))))) { + AddAffected(A); + } + }; + + AddAffectedFromEq(A); + AddAffectedFromEq(B); + } + } + + for (auto &AV : Affected) { + auto &AVV = getAffectedValues(AV); + if (std::find(AVV.begin(), AVV.end(), CI) == AVV.end()) + AVV.push_back(CI); + } +} + +void AssumptionCache::AffectedValueCallbackVH::deleted() { + auto AVI = AC->AffectedValues.find(getValPtr()); + if (AVI != AC->AffectedValues.end()) + AC->AffectedValues.erase(AVI); + // 'this' now dangles! +} + +void AssumptionCache::AffectedValueCallbackVH::allUsesReplacedWith(Value *NV) { + if (!isa<Instruction>(NV) && !isa<Argument>(NV)) + return; + + // Any assumptions that affected this value now affect the new value. 
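+  // For example (illustrative IR): if %old is RAUW'd with %new, a recorded
+  // "call void @llvm.assume(i1 %c)" whose condition %c was computed from %old
+  // must also be visible to clients that later query assumptionsFor(%new);
+  // the copy loop below makes that happen.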
+ + auto &NAVV = AC->getAffectedValues(NV); + auto AVI = AC->AffectedValues.find(getValPtr()); + if (AVI == AC->AffectedValues.end()) + return; + + for (auto &A : AVI->second) + if (std::find(NAVV.begin(), NAVV.end(), A) == NAVV.end()) + NAVV.push_back(A); +} + void AssumptionCache::scanFunction() { assert(!Scanned && "Tried to scan the function twice!"); assert(AssumeHandles.empty() && "Already have assumes when scanning!"); @@ -37,6 +140,10 @@ void AssumptionCache::scanFunction() { // Mark the scan as complete. Scanned = true; + + // Update affected values. + for (auto &A : AssumeHandles) + updateAffectedValues(cast<CallInst>(A)); } void AssumptionCache::registerAssumption(CallInst *CI) { @@ -72,6 +179,8 @@ void AssumptionCache::registerAssumption(CallInst *CI) { "Cache contains multiple copies of a call!"); } #endif + + updateAffectedValues(CI); } AnalysisKey AssumptionAnalysis::Key; diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index 08d50c29dfc8..d53364373d7b 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -44,10 +44,10 @@ add_llvm_library(LLVMAnalysis Lint.cpp Loads.cpp LoopAccessAnalysis.cpp + LoopAnalysisManager.cpp LoopUnrollAnalyzer.cpp LoopInfo.cpp LoopPass.cpp - LoopPassManager.cpp MemDepPrinter.cpp MemDerefPrinter.cpp MemoryBuiltins.cpp diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp index 67d1773f0811..6b77397956cd 100644 --- a/lib/Analysis/CostModel.cpp +++ b/lib/Analysis/CostModel.cpp @@ -438,8 +438,11 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { getOperandInfo(I->getOperand(0)); TargetTransformInfo::OperandValueKind Op2VK = getOperandInfo(I->getOperand(1)); + SmallVector<const Value*, 2> Operands(I->operand_values()); return TTI->getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK, - Op2VK); + Op2VK, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None, + Operands); } case Instruction::Select: { const SelectInst *SI = cast<SelectInst>(I); diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp index 76e2561b9da3..a661b0101e6a 100644 --- a/lib/Analysis/IVUsers.cpp +++ b/lib/Analysis/IVUsers.cpp @@ -16,8 +16,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -36,20 +36,9 @@ using namespace llvm; AnalysisKey IVUsersAnalysis::Key; -IVUsers IVUsersAnalysis::run(Loop &L, LoopAnalysisManager &AM) { - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); - Function *F = L.getHeader()->getParent(); - - return IVUsers(&L, FAM.getCachedResult<AssumptionAnalysis>(*F), - FAM.getCachedResult<LoopAnalysis>(*F), - FAM.getCachedResult<DominatorTreeAnalysis>(*F), - FAM.getCachedResult<ScalarEvolutionAnalysis>(*F)); -} - -PreservedAnalyses IVUsersPrinterPass::run(Loop &L, LoopAnalysisManager &AM) { - AM.getResult<IVUsersAnalysis>(L).print(OS); - return PreservedAnalyses::all(); +IVUsers IVUsersAnalysis::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR) { + return IVUsers(&L, &AR.AC, &AR.LI, &AR.DT, &AR.SE); } char IVUsersWrapperPass::ID = 0; diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 9b9faacd354c..4109049ecabc 100644 --- a/lib/Analysis/InlineCost.cpp +++ 
b/lib/Analysis/InlineCost.cpp @@ -636,30 +636,27 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { else if (Caller->optForSize()) Threshold = MinIfValid(Threshold, Params.OptSizeThreshold); - bool HotCallsite = false; - uint64_t TotalWeight; - if (PSI && CS.getInstruction()->extractProfTotalWeight(TotalWeight) && - PSI->isHotCount(TotalWeight)) { - HotCallsite = true; + // Adjust the threshold based on inlinehint attribute and profile based + // hotness information if the caller does not have MinSize attribute. + if (!Caller->optForMinSize()) { + if (Callee.hasFnAttribute(Attribute::InlineHint)) + Threshold = MaxIfValid(Threshold, Params.HintThreshold); + if (PSI) { + uint64_t TotalWeight; + if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) && + PSI->isHotCount(TotalWeight)) { + Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold); + } else if (PSI->isFunctionEntryHot(&Callee)) { + // If callsite hotness can not be determined, we may still know + // that the callee is hot and treat it as a weaker hint for threshold + // increase. + Threshold = MaxIfValid(Threshold, Params.HintThreshold); + } else if (PSI->isFunctionEntryCold(&Callee)) { + Threshold = MinIfValid(Threshold, Params.ColdThreshold); + } + } } - // Listen to the inlinehint attribute or profile based hotness information - // when it would increase the threshold and the caller does not need to - // minimize its size. - bool InlineHint = Callee.hasFnAttribute(Attribute::InlineHint) || - (PSI && PSI->isFunctionEntryHot(&Callee)); - if (InlineHint && !Caller->optForMinSize()) - Threshold = MaxIfValid(Threshold, Params.HintThreshold); - - if (HotCallsite && !Caller->optForMinSize()) - Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold); - - bool ColdCallee = PSI && PSI->isFunctionEntryCold(&Callee); - // For cold callees, use the ColdThreshold knob if it is available and reduces - // the threshold. - if (ColdCallee) - Threshold = MinIfValid(Threshold, Params.ColdThreshold); - // Finally, take the target-specific inlining threshold multiplier into // account. Threshold *= TTI.getInliningThresholdMultiplier(); diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 8da2f0981d0c..796e6e444980 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -3583,7 +3583,7 @@ static Value *simplifySelectBitTest(Value *TrueVal, Value *FalseVal, Value *X, *Y == *C) return TrueWhenUnset ? 
TrueVal : FalseVal; } - + return nullptr; } @@ -3595,7 +3595,7 @@ static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *TrueVal, unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits(); if (!BitWidth) return nullptr; - + APInt MinSignedValue; Value *X; if (match(CmpLHS, m_Trunc(m_Value(X))) && (X == TrueVal || X == FalseVal)) { @@ -4252,14 +4252,36 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, const Query &Q, unsigned MaxRecurse) { Intrinsic::ID IID = F->getIntrinsicID(); unsigned NumOperands = std::distance(ArgBegin, ArgEnd); - Type *ReturnType = F->getReturnType(); + + // Unary Ops + if (NumOperands == 1) { + // Perform idempotent optimizations + if (IsIdempotent(IID)) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(*ArgBegin)) { + if (II->getIntrinsicID() == IID) + return II; + } + } + + switch (IID) { + case Intrinsic::fabs: { + if (SignBitMustBeZero(*ArgBegin, Q.TLI)) + return *ArgBegin; + } + default: + return nullptr; + } + } // Binary Ops if (NumOperands == 2) { Value *LHS = *ArgBegin; Value *RHS = *(ArgBegin + 1); - if (IID == Intrinsic::usub_with_overflow || - IID == Intrinsic::ssub_with_overflow) { + Type *ReturnType = F->getReturnType(); + + switch (IID) { + case Intrinsic::usub_with_overflow: + case Intrinsic::ssub_with_overflow: { // X - X -> { 0, false } if (LHS == RHS) return Constant::getNullValue(ReturnType); @@ -4268,17 +4290,19 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, // undef - X -> undef if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS)) return UndefValue::get(ReturnType); - } - if (IID == Intrinsic::uadd_with_overflow || - IID == Intrinsic::sadd_with_overflow) { + return nullptr; + } + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: { // X + undef -> undef if (isa<UndefValue>(RHS)) return UndefValue::get(ReturnType); - } - if (IID == Intrinsic::umul_with_overflow || - IID == Intrinsic::smul_with_overflow) { + return nullptr; + } + case Intrinsic::umul_with_overflow: + case Intrinsic::smul_with_overflow: { // X * 0 -> { 0, false } if (match(RHS, m_Zero())) return Constant::getNullValue(ReturnType); @@ -4286,34 +4310,34 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, // X * undef -> { 0, false } if (match(RHS, m_Undef())) return Constant::getNullValue(ReturnType); - } - if (IID == Intrinsic::load_relative && isa<Constant>(LHS) && - isa<Constant>(RHS)) - return SimplifyRelativeLoad(cast<Constant>(LHS), cast<Constant>(RHS), - Q.DL); + return nullptr; + } + case Intrinsic::load_relative: { + Constant *C0 = dyn_cast<Constant>(LHS); + Constant *C1 = dyn_cast<Constant>(RHS); + if (C0 && C1) + return SimplifyRelativeLoad(C0, C1, Q.DL); + return nullptr; + } + default: + return nullptr; + } } // Simplify calls to llvm.masked.load.* - if (IID == Intrinsic::masked_load) { + switch (IID) { + case Intrinsic::masked_load: { Value *MaskArg = ArgBegin[2]; Value *PassthruArg = ArgBegin[3]; // If the mask is all zeros or undef, the "passthru" argument is the result. 
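    // (Illustrative example: a call such as
    //   llvm.masked.load(%ptr, i32 4, zeroinitializer, %passthru)
    // simplifies to %passthru.)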
if (maskIsAllZeroOrUndef(MaskArg)) return PassthruArg; + return nullptr; } - - // Perform idempotent optimizations - if (!IsIdempotent(IID)) + default: return nullptr; - - // Unary Ops - if (NumOperands == 1) - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(*ArgBegin)) - if (II->getIntrinsicID() == IID) - return II; - - return nullptr; + } } template <typename IterTy> diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index 4f6355236873..d442310476cf 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -925,7 +925,7 @@ void LazyValueInfoImpl::intersectAssumeOrGuardBlockValueConstantRange( if (!BBI) return; - for (auto &AssumeVH : AC->assumptions()) { + for (auto &AssumeVH : AC->assumptionsFor(Val)) { if (!AssumeVH) continue; auto *I = cast<CallInst>(AssumeVH); diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp index 2f3dca3d23fa..bf8007213097 100644 --- a/lib/Analysis/LoopAccessAnalysis.cpp +++ b/lib/Analysis/LoopAccessAnalysis.cpp @@ -12,22 +12,22 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/EquivalenceClasses.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -44,10 +44,10 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" @@ -2120,35 +2120,9 @@ INITIALIZE_PASS_END(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true) AnalysisKey LoopAccessAnalysis::Key; -LoopAccessInfo LoopAccessAnalysis::run(Loop &L, LoopAnalysisManager &AM) { - const FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); - Function &F = *L.getHeader()->getParent(); - auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(F); - auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(F); - auto *AA = FAM.getCachedResult<AAManager>(F); - auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); - auto *LI = FAM.getCachedResult<LoopAnalysis>(F); - if (!SE) - report_fatal_error( - "ScalarEvolution must have been cached at a higher level"); - if (!AA) - report_fatal_error("AliasAnalysis must have been cached at a higher level"); - if (!DT) - report_fatal_error("DominatorTree must have been cached at a higher level"); - if (!LI) - report_fatal_error("LoopInfo must have been cached at a higher level"); - return LoopAccessInfo(&L, SE, TLI, AA, DT, LI); -} - -PreservedAnalyses LoopAccessInfoPrinterPass::run(Loop &L, - LoopAnalysisManager 
&AM) {
-  Function &F = *L.getHeader()->getParent();
-  auto &LAI = AM.getResult<LoopAccessAnalysis>(L);
-  OS << "Loop access info in function '" << F.getName() << "':\n";
-  OS.indent(2) << L.getHeader()->getName() << ":\n";
-  LAI.print(OS, 4);
-  return PreservedAnalyses::all();
+LoopAccessInfo LoopAccessAnalysis::run(Loop &L, LoopAnalysisManager &AM,
+                                       LoopStandardAnalysisResults &AR) {
+  return LoopAccessInfo(&L, &AR.SE, &AR.TLI, &AR.AA, &AR.DT, &AR.LI);
 }
 
 namespace llvm {
diff --git a/lib/Analysis/LoopAnalysisManager.cpp b/lib/Analysis/LoopAnalysisManager.cpp
new file mode 100644
index 000000000000..5be3ee341c9c
--- /dev/null
+++ b/lib/Analysis/LoopAnalysisManager.cpp
@@ -0,0 +1,160 @@
+//===- LoopAnalysisManager.cpp - Loop analysis management -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/Dominators.h"
+
+using namespace llvm;
+
+// Explicit template instantiations and specialization definitions for core
+// template typedefs.
+namespace llvm {
+template class AllAnalysesOn<Loop>;
+template class AnalysisManager<Loop, LoopStandardAnalysisResults &>;
+template class InnerAnalysisManagerProxy<LoopAnalysisManager, Function>;
+template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop,
+                                         LoopStandardAnalysisResults &>;
+
+bool LoopAnalysisManagerFunctionProxy::Result::invalidate(
+    Function &F, const PreservedAnalyses &PA,
+    FunctionAnalysisManager::Invalidator &Inv) {
+  // First compute the sequence of IR units covered by this proxy. We will want
+  // to visit this in postorder, but because this is a tree structure we can do
+  // this by building a preorder sequence and walking it in reverse.
+  SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
+  // Note that we want to walk the roots in reverse order because we will end
+  // up reversing the preorder sequence. However, it happens that the loop nest
+  // roots are in reverse order within the LoopInfo object. So we just walk
+  // forward here.
+  // FIXME: If we change the order of LoopInfo we will want to add a reverse
+  // here.
+  for (Loop *RootL : *LI) {
+    assert(PreOrderWorklist.empty() &&
+           "Must start with an empty preorder walk worklist.");
+    PreOrderWorklist.push_back(RootL);
+    do {
+      Loop *L = PreOrderWorklist.pop_back_val();
+      PreOrderWorklist.append(L->begin(), L->end());
+      PreOrderLoops.push_back(L);
+    } while (!PreOrderWorklist.empty());
+  }
+
+  // If this proxy or the loop info is going to be invalidated, we also need
+  // to clear all the keys coming from that analysis. We also completely blow
+  // away the loop analyses if any of the standard analyses provided by the
+  // loop pass manager go away so that loop analyses can freely use these
+  // without worrying about declaring dependencies on them etc.
+  // FIXME: It isn't clear if this is the right tradeoff. We could instead make
+  // loop analyses declare any dependencies on these and use the more general
+  // invalidation logic below to act on that.
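+  //
+  // Concretely (an illustrative note): a function pass that returns
+  // getLoopPassPreservedAnalyses() preserves this proxy and all five analyses
+  // checked below, so the cached per-loop results survive; a pass that
+  // preserves nothing trips this check and clears them all.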
+ auto PAC = PA.getChecker<LoopAnalysisManagerFunctionProxy>(); + if (!(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>()) || + Inv.invalidate<AAManager>(F, PA) || + Inv.invalidate<AssumptionAnalysis>(F, PA) || + Inv.invalidate<DominatorTreeAnalysis>(F, PA) || + Inv.invalidate<LoopAnalysis>(F, PA) || + Inv.invalidate<ScalarEvolutionAnalysis>(F, PA)) { + // Note that the LoopInfo may be stale at this point, however the loop + // objects themselves remain the only viable keys that could be in the + // analysis manager's cache. So we just walk the keys and forcibly clear + // those results. Note that the order doesn't matter here as this will just + // directly destroy the results without calling methods on them. + for (Loop *L : PreOrderLoops) + InnerAM->clear(*L); + + // We also need to null out the inner AM so that when the object gets + // destroyed as invalid we don't try to clear the inner AM again. At that + // point we won't be able to reliably walk the loops for this function and + // only clear results associated with those loops the way we do here. + // FIXME: Making InnerAM null at this point isn't very nice. Most analyses + // try to remain valid during invalidation. Maybe we should add an + // `IsClean` flag? + InnerAM = nullptr; + + // Now return true to indicate this *is* invalid and a fresh proxy result + // needs to be built. This is especially important given the null InnerAM. + return true; + } + + // Directly check if the relevant set is preserved so we can short circuit + // invalidating loops. + bool AreLoopAnalysesPreserved = + PA.allAnalysesInSetPreserved<AllAnalysesOn<Loop>>(); + + // Since we have a valid LoopInfo we can actually leave the cached results in + // the analysis manager associated with the Loop keys, but we need to + // propagate any necessary invalidation logic into them. We'd like to + // invalidate things in roughly the same order as they were put into the + // cache and so we walk the preorder list in reverse to form a valid + // postorder. + for (Loop *L : reverse(PreOrderLoops)) { + Optional<PreservedAnalyses> InnerPA; + + // Check to see whether the preserved set needs to be adjusted based on + // function-level analysis invalidation triggering deferred invalidation + // for this loop. + if (auto *OuterProxy = + InnerAM->getCachedResult<FunctionAnalysisManagerLoopProxy>(*L)) + for (const auto &OuterInvalidationPair : + OuterProxy->getOuterInvalidations()) { + AnalysisKey *OuterAnalysisID = OuterInvalidationPair.first; + const auto &InnerAnalysisIDs = OuterInvalidationPair.second; + if (Inv.invalidate(OuterAnalysisID, F, PA)) { + if (!InnerPA) + InnerPA = PA; + for (AnalysisKey *InnerAnalysisID : InnerAnalysisIDs) + InnerPA->abandon(InnerAnalysisID); + } + } + + // Check if we needed a custom PA set. If so we'll need to run the inner + // invalidation. + if (InnerPA) { + InnerAM->invalidate(*L, *InnerPA); + continue; + } + + // Otherwise we only need to do invalidation if the original PA set didn't + // preserve all Loop analyses. + if (!AreLoopAnalysesPreserved) + InnerAM->invalidate(*L, PA); + } + + // Return false to indicate that this result is still a valid proxy. 
+ return false; +} + +template <> +LoopAnalysisManagerFunctionProxy::Result +LoopAnalysisManagerFunctionProxy::run(Function &F, + FunctionAnalysisManager &AM) { + return Result(*InnerAM, AM.getResult<LoopAnalysis>(F)); +} +} + +PreservedAnalyses llvm::getLoopPassPreservedAnalyses() { + PreservedAnalyses PA; + PA.preserve<AssumptionAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + PA.preserve<LoopAnalysisManagerFunctionProxy>(); + PA.preserve<ScalarEvolutionAnalysis>(); + // TODO: What we really want to do here is preserve an AA category, but that + // concept doesn't exist yet. + PA.preserve<AAManager>(); + PA.preserve<BasicAA>(); + PA.preserve<GlobalsAA>(); + PA.preserve<SCEVAA>(); + return PA; +} diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index 3d85ef6988a9..f449ce94d57c 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -689,18 +689,13 @@ PreservedAnalyses LoopPrinterPass::run(Function &F, return PreservedAnalyses::all(); } -PrintLoopPass::PrintLoopPass() : OS(dbgs()) {} -PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner) - : OS(OS), Banner(Banner) {} - -PreservedAnalyses PrintLoopPass::run(Loop &L, AnalysisManager<Loop> &) { +void llvm::printLoop(Loop &L, raw_ostream &OS, const std::string &Banner) { OS << Banner; for (auto *Block : L.blocks()) if (Block) Block->print(OS); else OS << "Printing <null> block"; - return PreservedAnalyses::all(); } //===----------------------------------------------------------------------===// diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp index b5b8040984d7..3f4a07942154 100644 --- a/lib/Analysis/LoopPass.cpp +++ b/lib/Analysis/LoopPass.cpp @@ -14,7 +14,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LLVMContext.h" @@ -32,13 +32,14 @@ namespace { /// PrintLoopPass - Print a Function corresponding to a Loop. /// class PrintLoopPassWrapper : public LoopPass { - PrintLoopPass P; + raw_ostream &OS; + std::string Banner; public: static char ID; - PrintLoopPassWrapper() : LoopPass(ID) {} + PrintLoopPassWrapper() : LoopPass(ID), OS(dbgs()) {} PrintLoopPassWrapper(raw_ostream &OS, const std::string &Banner) - : LoopPass(ID), P(OS, Banner) {} + : LoopPass(ID), OS(OS), Banner(Banner) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); @@ -49,8 +50,7 @@ public: [](BasicBlock *BB) { return BB; }); if (BBI != L->blocks().end() && isFunctionInPrintList((*BBI)->getParent()->getName())) { - LoopAnalysisManager DummyLAM; - P.run(*L, DummyLAM); + printLoop(*L, OS, Banner); } return false; } diff --git a/lib/Analysis/LoopPassManager.cpp b/lib/Analysis/LoopPassManager.cpp deleted file mode 100644 index 044e5d55dafd..000000000000 --- a/lib/Analysis/LoopPassManager.cpp +++ /dev/null @@ -1,59 +0,0 @@ -//===- LoopPassManager.cpp - Loop pass management -------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/LoopPassManager.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/IR/Dominators.h"
-
-using namespace llvm;
-
-// Explicit template instantiations and specialization defininitions for core
-// template typedefs.
-namespace llvm {
-template class PassManager<Loop>;
-template class AnalysisManager<Loop>;
-template class InnerAnalysisManagerProxy<LoopAnalysisManager, Function>;
-template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop>;
-
-template <>
-bool LoopAnalysisManagerFunctionProxy::Result::invalidate(
-    Function &F, const PreservedAnalyses &PA,
-    FunctionAnalysisManager::Invalidator &Inv) {
-  // If this proxy isn't marked as preserved, the set of Function objects in
-  // the module may have changed. We therefore can't call
-  // InnerAM->invalidate(), because any pointers to Functions it has may be
-  // stale.
-  auto PAC = PA.getChecker<LoopAnalysisManagerFunctionProxy>();
-  if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Loop>>())
-    InnerAM->clear();
-
-  // FIXME: Proper suppor for invalidation isn't yet implemented for the LPM.
-
-  // Return false to indicate that this result is still a valid proxy.
-  return false;
-}
-}
-
-PreservedAnalyses llvm::getLoopPassPreservedAnalyses() {
-  PreservedAnalyses PA;
-  PA.preserve<DominatorTreeAnalysis>();
-  PA.preserve<LoopAnalysis>();
-  PA.preserve<ScalarEvolutionAnalysis>();
-  // TODO: What we really want to do here is preserve an AA category, but that
-  // concept doesn't exist yet.
-  PA.preserve<AAManager>();
-  PA.preserve<BasicAA>();
-  PA.preserve<GlobalsAA>();
-  PA.preserve<SCEVAA>();
-  return PA;
-}
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index e7415e623196..66a0d145dcd8 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -323,17 +323,28 @@ MemDepResult MemoryDependenceResults::getPointerDependencyFrom(
     const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt,
     BasicBlock *BB, Instruction *QueryInst, unsigned *Limit) {
+  MemDepResult InvariantGroupDependency = MemDepResult::getUnknown();
   if (QueryInst != nullptr) {
     if (auto *LI = dyn_cast<LoadInst>(QueryInst)) {
-      MemDepResult invariantGroupDependency =
-          getInvariantGroupPointerDependency(LI, BB);
+      InvariantGroupDependency = getInvariantGroupPointerDependency(LI, BB);
 
-      if (invariantGroupDependency.isDef())
-        return invariantGroupDependency;
+      if (InvariantGroupDependency.isDef())
+        return InvariantGroupDependency;
     }
   }
-  return getSimplePointerDependencyFrom(MemLoc, isLoad, ScanIt, BB, QueryInst,
-                                        Limit);
+  MemDepResult SimpleDep = getSimplePointerDependencyFrom(
+      MemLoc, isLoad, ScanIt, BB, QueryInst, Limit);
+  if (SimpleDep.isDef())
+    return SimpleDep;
+  // A non-local invariant group dependency indicates there is a non-local Def
+  // (it only returns NonLocal if it finds a non-local Def), which is better
+  // than a local clobber and everything else.
+  if (InvariantGroupDependency.isNonLocal())
+    return InvariantGroupDependency;
+
+  assert(InvariantGroupDependency.isUnknown() &&
+         "InvariantGroupDependency should only be unknown at this point");
+  return SimpleDep;
 }
 
 MemDepResult
@@ -358,6 +369,20 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
   // Queue to process all pointers that are equivalent to load operand.
   SmallVector<const Value *, 8> LoadOperandsQueue;
   LoadOperandsQueue.push_back(LoadOperand);
+
+  Instruction *ClosestDependency = nullptr;
+  // The order of instructions in the uses list is unpredictable. In order to
+  // always get the same result, we look for the closest dominating instruction.
+  auto GetClosestDependency = [this](Instruction *Best, Instruction *Other) {
+    assert(Other && "Must call it with a non-null instruction");
+    if (Best == nullptr || DT.dominates(Best, Other))
+      return Other;
+    return Best;
+  };
+
+
+  // FIXME: This loop is O(N^2) because dominates can be O(n) and in worst case
+  // we will see all the instructions. This should be fixed in MSSA.
   while (!LoadOperandsQueue.empty()) {
     const Value *Ptr = LoadOperandsQueue.pop_back_val();
     assert(Ptr && !isa<GlobalValue>(Ptr) &&
@@ -388,12 +413,24 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
       // If we hit load/store with the same invariant.group metadata (and the
       // same pointer operand) we can assume that value pointed by pointer
       // operand didn't change.
-      if ((isa<LoadInst>(U) || isa<StoreInst>(U)) && U->getParent() == BB &&
+      if ((isa<LoadInst>(U) || isa<StoreInst>(U)) &&
           U->getMetadata(LLVMContext::MD_invariant_group) == InvariantGroupMD)
-        return MemDepResult::getDef(U);
+        ClosestDependency = GetClosestDependency(ClosestDependency, U);
     }
   }
-  return MemDepResult::getUnknown();
+
+  if (!ClosestDependency)
+    return MemDepResult::getUnknown();
+  if (ClosestDependency->getParent() == BB)
+    return MemDepResult::getDef(ClosestDependency);
+  // Def(U) can't be returned here because it is non-local. If no local
+  // dependency is found, return NonLocal, counting on the user to call
+  // getNonLocalPointerDependency, which will return the cached
+  // result.
+  NonLocalDefsCache.try_emplace(
+      LI, NonLocalDepResult(ClosestDependency->getParent(),
+                            MemDepResult::getDef(ClosestDependency), nullptr));
+  return MemDepResult::getNonLocal();
 }
 
 MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
@@ -877,7 +914,17 @@ void MemoryDependenceResults::getNonLocalPointerDependency(
   assert(Loc.Ptr->getType()->isPointerTy() &&
          "Can't get pointer deps of a non-pointer!");
   Result.clear();
-
+  {
+    // Check if there is a cached Def with invariant.group. FIXME: the cache
+    // might be invalid if the cached instruction is removed between the call
+    // to getPointerDependencyFrom and this function.
+    auto NonLocalDefIt = NonLocalDefsCache.find(QueryInst);
+    if (NonLocalDefIt != NonLocalDefsCache.end()) {
+      Result.push_back(std::move(NonLocalDefIt->second));
+      NonLocalDefsCache.erase(NonLocalDefIt);
+      return;
+    }
+  }
   // This routine does not expect to deal with volatile instructions.
   // Doing so would require piping through the QueryInst all the way through.
// TODO: volatiles can't be elided, but they can be reordered with other diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 44f1a6dde0d2..b3905cc01e84 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -7032,20 +7032,21 @@ static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const APInt &B, // 3. Compute I: the multiplicative inverse of (A / D) in arithmetic // modulo (N / D). // - // (N / D) may need BW+1 bits in its representation. Hence, we'll use this - // bit width during computations. + // If D == 1, (N / D) == N == 2^BW, so we need one extra bit to represent + // (N / D) in general. The inverse itself always fits into BW bits, though, + // so we immediately truncate it. APInt AD = A.lshr(Mult2).zext(BW + 1); // AD = A / D APInt Mod(BW + 1, 0); Mod.setBit(BW - Mult2); // Mod = N / D - APInt I = AD.multiplicativeInverse(Mod); + APInt I = AD.multiplicativeInverse(Mod).trunc(BW); // 4. Compute the minimum unsigned root of the equation: // I * (B / D) mod (N / D) - APInt Result = (I * B.lshr(Mult2).zext(BW + 1)).urem(Mod); + // To simplify the computation, we factor out the divide by D: + // (I * B mod N) / D + APInt Result = (I * B).lshr(Mult2); - // The result is guaranteed to be less than 2^BW so we may truncate it to BW - // bits. - return SE.getConstant(Result.trunc(BW)); + return SE.getConstant(Result); } /// Find the roots of the quadratic equation for the given quadratic chrec @@ -7206,17 +7207,25 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit, // 1*N = -Start; -1*N = Start (mod 2^BW), so: // N = Distance (as unsigned) if (StepC->getValue()->equalsInt(1) || StepC->getValue()->isAllOnesValue()) { - ConstantRange CR = getUnsignedRange(Start); - const SCEV *MaxBECount; - if (!CountDown && CR.getUnsignedMin().isMinValue()) - // When counting up, the worst starting value is 1, not 0. - MaxBECount = CR.getUnsignedMax().isMinValue() - ? getConstant(APInt::getMinValue(CR.getBitWidth())) - : getConstant(APInt::getMaxValue(CR.getBitWidth())); - else - MaxBECount = getConstant(CountDown ? CR.getUnsignedMax() - : -CR.getUnsignedMin()); - return ExitLimit(Distance, MaxBECount, false, Predicates); + APInt MaxBECount = getUnsignedRange(Distance).getUnsignedMax(); + + // When a loop like "for (int i = 0; i != n; ++i) { /* body */ }" is rotated, + // we end up with a loop whose backedge-taken count is n - 1. Detect this + // case, and see if we can improve the bound. + // + // Explicitly handling this here is necessary because getUnsignedRange + // isn't context-sensitive; it doesn't know that we only care about the + // range inside the loop. + const SCEV *Zero = getZero(Distance->getType()); + const SCEV *One = getOne(Distance->getType()); + const SCEV *DistancePlusOne = getAddExpr(Distance, One); + if (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, DistancePlusOne, Zero)) { + // If Distance + 1 doesn't overflow, we can compute the maximum distance + // as "unsigned_max(Distance + 1) - 1". 
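The reworked SolveLinEquationWithOverflow is ordinary 2-adic arithmetic: strip D = 2^Mult2 from A, invert the odd remainder, and fold the division by D into a final shift. A self-contained check of the factored form (I * B) >> Mult2, assuming BW = 32 and that D divides B (otherwise no solution exists):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Multiplicative inverse of an odd number modulo 2^32 by Newton iteration;
// each step doubles the number of correct low bits (3 -> 6 -> 12 -> 24 -> 48).
static uint32_t inverseMod2_32(uint32_t A) {
  assert((A & 1) && "only odd values are invertible modulo 2^32");
  uint32_t X = A; // correct to 3 bits, since A*A == 1 (mod 8) for odd A
  for (int I = 0; I < 4; ++I)
    X *= 2 - A * X;
  return X;
}

int main() {
  // Solve A*X == B (mod 2^32) for A = 12, B = 36; D = 4, odd part AD = 3.
  uint32_t A = 12, B = 36;
  unsigned Mult2 = 0;
  while (!((A >> Mult2) & 1))
    ++Mult2;                              // Mult2 = 2, i.e. D = 2^2
  uint32_t I = inverseMod2_32(A >> Mult2);
  uint32_t X = (I * B) >> Mult2;          // the factored form used by the patch
  printf("X = %u, A*X = %u\n", X, A * X); // X = 3, A*X = 36
}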
+ ConstantRange CR = getUnsignedRange(DistancePlusOne); + MaxBECount = APIntOps::umin(MaxBECount, CR.getUnsignedMax() - 1); + } + return ExitLimit(Distance, getConstant(MaxBECount), false, Predicates); } // As a special case, handle the instance where Step is a positive power of diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index cd8c24630df1..5c0d1aac1b98 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -277,9 +277,10 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const { int TargetTransformInfo::getArithmeticInstrCost( unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) const { + OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) const { int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index d31472c0d33c..b79370baad10 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -526,7 +526,10 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero, unsigned BitWidth = KnownZero.getBitWidth(); - for (auto &AssumeVH : Q.AC->assumptions()) { + // Note that the patterns below need to be kept in sync with the code + // in AssumptionCache::updateAffectedValues. + + for (auto &AssumeVH : Q.AC->assumptionsFor(V)) { if (!AssumeVH) continue; CallInst *I = cast<CallInst>(AssumeVH); @@ -2580,51 +2583,70 @@ bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI, return false; } -bool llvm::CannotBeOrderedLessThanZero(const Value *V, - const TargetLibraryInfo *TLI, - unsigned Depth) { - if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V)) - return !CFP->getValueAPF().isNegative() || CFP->getValueAPF().isZero(); +/// If \p SignBitOnly is true, test for a known 0 sign bit rather than a +/// standard ordered compare. e.g. make -0.0 olt 0.0 be true because of the sign +/// bit despite comparing equal. +static bool cannotBeOrderedLessThanZeroImpl(const Value *V, + const TargetLibraryInfo *TLI, + bool SignBitOnly, + unsigned Depth) { + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V)) { + return !CFP->getValueAPF().isNegative() || + (!SignBitOnly && CFP->getValueAPF().isZero()); + } if (Depth == MaxDepth) - return false; // Limit search depth. + return false; // Limit search depth. const Operator *I = dyn_cast<Operator>(V); - if (!I) return false; + if (!I) + return false; switch (I->getOpcode()) { - default: break; + default: + break; // Unsigned integers are always nonnegative. case Instruction::UIToFP: return true; case Instruction::FMul: // x*x is always non-negative or a NaN. 
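The howFarToZero refinement at the top of this hunk targets the shape produced by loop rotation: with the entry guard n != 0 proved, a backedge-taken count of n - 1 can be capped at umax(n) - 1 rather than the type's maximum. The shape in question, as scalar code (sketch):

// Rotated form of "for (unsigned i = 0; i != n; ++i) body(i);":
void rotated(unsigned n) {
  unsigned i = 0;
  if (n != 0) {          // guard matched by isLoopEntryGuardedByCond
    do {
      // body(i);
      ++i;
    } while (i != n);    // backedge taken n - 1 times
  }
}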
- if (I->getOperand(0) == I->getOperand(1)) + if (I->getOperand(0) == I->getOperand(1) && + (!SignBitOnly || cast<FPMathOperator>(I)->hasNoNaNs())) return true; + LLVM_FALLTHROUGH; case Instruction::FAdd: case Instruction::FDiv: case Instruction::FRem: - return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1) && - CannotBeOrderedLessThanZero(I->getOperand(1), TLI, Depth + 1); + return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, + Depth + 1) && + cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly, + Depth + 1); case Instruction::Select: - return CannotBeOrderedLessThanZero(I->getOperand(1), TLI, Depth + 1) && - CannotBeOrderedLessThanZero(I->getOperand(2), TLI, Depth + 1); + return cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly, + Depth + 1) && + cannotBeOrderedLessThanZeroImpl(I->getOperand(2), TLI, SignBitOnly, + Depth + 1); case Instruction::FPExt: case Instruction::FPTrunc: // Widening/narrowing never change sign. - return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1); + return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, + Depth + 1); case Instruction::Call: Intrinsic::ID IID = getIntrinsicForCallSite(cast<CallInst>(I), TLI); switch (IID) { default: break; case Intrinsic::maxnum: - return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1) || - CannotBeOrderedLessThanZero(I->getOperand(1), TLI, Depth + 1); + return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, + Depth + 1) || + cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly, + Depth + 1); case Intrinsic::minnum: - return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1) && - CannotBeOrderedLessThanZero(I->getOperand(1), TLI, Depth + 1); + return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, + Depth + 1) && + cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly, + Depth + 1); case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::fabs: @@ -2636,18 +2658,30 @@ bool llvm::CannotBeOrderedLessThanZero(const Value *V, if (CI->getBitWidth() <= 64 && CI->getSExtValue() % 2u == 0) return true; } - return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1); + return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, + Depth + 1); case Intrinsic::fma: case Intrinsic::fmuladd: // x*x+y is non-negative if y is non-negative. return I->getOperand(0) == I->getOperand(1) && - CannotBeOrderedLessThanZero(I->getOperand(2), TLI, Depth + 1); + (!SignBitOnly || cast<FPMathOperator>(I)->hasNoNaNs()) && + cannotBeOrderedLessThanZeroImpl(I->getOperand(2), TLI, SignBitOnly, + Depth + 1); } break; } return false; } +bool llvm::CannotBeOrderedLessThanZero(const Value *V, + const TargetLibraryInfo *TLI) { + return cannotBeOrderedLessThanZeroImpl(V, TLI, false, 0); +} + +bool llvm::SignBitMustBeZero(const Value *V, const TargetLibraryInfo *TLI) { + return cannotBeOrderedLessThanZeroImpl(V, TLI, true, 0); +} + /// If the specified value can be set by repeating the same byte in memory, /// return the i8 value that it is represented with. 
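The split into CannotBeOrderedLessThanZero and SignBitMustBeZero above exists because the two questions disagree on -0.0 and on NaN sign bits; a self-contained illustration:

#include <cmath>
#include <cstdio>

int main() {
  // -0.0 compares equal to 0.0, so it is never *ordered* less than zero,
  // yet its sign bit is set: the two predicates must differ here.
  printf("-0.0 < 0.0: %d, signbit(-0.0): %d\n", -0.0 < 0.0, std::signbit(-0.0));
  // x*x is non-negative or NaN, and a NaN may carry either sign bit; this is
  // why the SignBitOnly path additionally demands the nnan flag for FMul/fma.
  double Nan = std::sqrt(-1.0);
  printf("signbit(NaN): %d (value is unspecified)\n", std::signbit(Nan));
}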
This is /// true for all i8 values obviously, but is also true for i32 0, i32 -1, diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 54aa0a9e3282..76549540ce0f 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -22,3 +22,4 @@ add_subdirectory(ProfileData) add_subdirectory(Fuzzer) add_subdirectory(Passes) add_subdirectory(LibDriver) +add_subdirectory(XRay) diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 408b34a3cdc0..83440513225c 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -13,11 +13,13 @@ #include "CodeViewDebug.h" #include "llvm/ADT/TinyPtrVector.h" +#include "llvm/DebugInfo/CodeView/CVTypeDumper.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" -#include "llvm/DebugInfo/CodeView/TypeDumper.h" +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" +#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" @@ -467,7 +469,8 @@ void CodeViewDebug::emitTypeInformation() { CommentPrefix += ' '; } - CVTypeDumper CVTD(nullptr, /*PrintRecordBytes=*/false); + TypeDatabase TypeDB; + CVTypeDumper CVTD(TypeDB); TypeTable.ForEachRecord([&](TypeIndex Index, ArrayRef<uint8_t> Record) { if (OS.isVerboseAsm()) { // Emit a block comment describing the type record for readability. @@ -475,8 +478,8 @@ void CodeViewDebug::emitTypeInformation() { raw_svector_ostream CommentOS(CommentBlock); ScopedPrinter SP(CommentOS); SP.setPrefix(CommentPrefix); - CVTD.setPrinter(&SP); - Error E = CVTD.dump(Record); + TypeDumpVisitor TDV(TypeDB, &SP, false); + Error E = CVTD.dump(Record, TDV); if (E) { logAllUnhandledErrors(std::move(E), errs(), "error: "); llvm_unreachable("produced malformed type record"); diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index 8ae2f2487cad..a8a3b30d5b60 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -79,6 +79,13 @@ void DIEAbbrev::Emit(const AsmPrinter *AP) const { // Emit form type. AP->EmitULEB128(AttrData.getForm(), dwarf::FormEncodingString(AttrData.getForm()).data()); + + // Emit value for DW_FORM_implicit_const. + if (AttrData.getForm() == dwarf::DW_FORM_implicit_const) { + assert(AP->getDwarfVersion() >= 5 && + "DW_FORM_implicit_const is supported starting from DWARFv5"); + AP->EmitSLEB128(AttrData.getValue()); + } } // Mark end of abbreviation. @@ -160,7 +167,11 @@ DIE *DIE::getParent() const { DIEAbbrev DIE::generateAbbrev() const { DIEAbbrev Abbrev(Tag, hasChildren()); for (const DIEValue &V : values()) - Abbrev.AddAttribute(V.getAttribute(), V.getForm()); + if (V.getForm() == dwarf::DW_FORM_implicit_const) + Abbrev.AddImplicitConstAttribute(V.getAttribute(), + V.getDIEInteger().getValue()); + else + Abbrev.AddAttribute(V.getAttribute(), V.getForm()); return Abbrev; } @@ -342,6 +353,8 @@ void DIEValue::dump() const { /// void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { switch (Form) { + case dwarf::DW_FORM_implicit_const: + LLVM_FALLTHROUGH; case dwarf::DW_FORM_flag_present: // Emit something to keep the lines and comments in sync. // FIXME: Is there a better way to do this? 
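DW_FORM_implicit_const (introduced in DWARF v5) keeps the attribute's value in the abbreviation, so the DIE itself contributes zero bytes, matching the EmitValue and SizeOf cases in the surrounding hunks. A minimal sketch of the abbreviation side (form and attribute constants are from the DWARF v5 specification; the helper names are illustrative):

#include <cstdint>
#include <vector>

// Minimal SLEB128 encoder, enough to show where DW_FORM_implicit_const's
// value lives: in the abbreviation, not in the DIE.
static void emitSLEB128(std::vector<uint8_t> &Out, int64_t V) {
  bool More;
  do {
    uint8_t Byte = V & 0x7f;
    V >>= 7;
    More = !((V == 0 && !(Byte & 0x40)) || (V == -1 && (Byte & 0x40)));
    if (More)
      Byte |= 0x80;
    Out.push_back(Byte);
  } while (More);
}

// Abbreviation entry for (DW_AT_decl_line, DW_FORM_implicit_const, 42); the
// matching DIE then encodes this attribute in zero bytes, as SizeOf() says.
void emitAbbrevAttr(std::vector<uint8_t> &Abbrev) {
  const int64_t DW_AT_decl_line = 0x3b, DW_FORM_implicit_const = 0x21;
  emitSLEB128(Abbrev, DW_AT_decl_line);        // same bytes as ULEB for < 64
  emitSLEB128(Abbrev, DW_FORM_implicit_const);
  emitSLEB128(Abbrev, 42);                     // the value travels with the abbrev
}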
@@ -406,6 +419,7 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { /// unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { switch (Form) { + case dwarf::DW_FORM_implicit_const: LLVM_FALLTHROUGH; case dwarf::DW_FORM_flag_present: return 0; case dwarf::DW_FORM_flag: LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref1: LLVM_FALLTHROUGH; diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 4f90245c6d49..2a866c071f59 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -200,6 +200,8 @@ void DwarfUnit::addUInt(DIEValueList &Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form, uint64_t Integer) { if (!Form) Form = DIEInteger::BestForm(false, Integer); + assert(Form != dwarf::DW_FORM_implicit_const && + "DW_FORM_implicit_const is used only for signed integers"); Die.addValue(DIEValueAllocator, Attribute, *Form, DIEInteger(Integer)); } diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp index 04bb7ca5ba9e..cc026ef27296 100644 --- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -223,6 +223,7 @@ RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping( for (RegisterBankInfo::InstructionMapping &CurMapping : PossibleMappings) { MappingCost CurCost = computeMapping(MI, CurMapping, LocalRepairPts, &Cost); if (CurCost < Cost) { + DEBUG(dbgs() << "New best: " << CurCost << '\n'); Cost = CurCost; BestMapping = &CurMapping; RepairPts.clear(); @@ -377,8 +378,10 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( DEBUG(dbgs() << "Evaluating mapping cost for: " << MI); DEBUG(dbgs() << "With: " << InstrMapping << '\n'); RepairPts.clear(); - if (BestCost && Cost > *BestCost) + if (BestCost && Cost > *BestCost) { + DEBUG(dbgs() << "Mapping is too expensive from the start\n"); return Cost; + } // Moreover, to realize this mapping, the register bank of each operand must // match this mapping. In other words, we may need to locally reassign the @@ -392,17 +395,17 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( unsigned Reg = MO.getReg(); if (!Reg) continue; - DEBUG(dbgs() << "Opd" << OpIdx); + DEBUG(dbgs() << "Opd" << OpIdx << '\n'); const RegisterBankInfo::ValueMapping &ValMapping = InstrMapping.getOperandMapping(OpIdx); // If Reg is already properly mapped, this is free. bool Assign; if (assignmentMatch(Reg, ValMapping, Assign)) { - DEBUG(dbgs() << " is free (match).\n"); + DEBUG(dbgs() << "=> is free (match).\n"); continue; } if (Assign) { - DEBUG(dbgs() << " is free (simple assignment).\n"); + DEBUG(dbgs() << "=> is free (simple assignment).\n"); RepairPts.emplace_back(RepairingPlacement(MI, OpIdx, *TRI, *this, RepairingPlacement::Reassign)); continue; @@ -420,8 +423,10 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( tryAvoidingSplit(RepairPt, MO, ValMapping); // Check that the materialization of the repairing is possible. - if (!RepairPt.canMaterialize()) + if (!RepairPt.canMaterialize()) { + DEBUG(dbgs() << "Mapping involves impossible repairing\n"); return MappingCost::ImpossibleCost(); + } // Account for the split cost and repair cost. // Unless the cost is already saturated or we do not care about the cost. @@ -476,8 +481,10 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( // Stop looking into what it takes to repair, this is already // too expensive. 
- if (BestCost && Cost > *BestCost) + if (BestCost && Cost > *BestCost) { + DEBUG(dbgs() << "Mapping is too expensive, stop processing\n"); return Cost; + } // No need to accumulate more cost information. // We need to still gather the repairing information though. @@ -485,6 +492,7 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( break; } } + DEBUG(dbgs() << "Total cost is: " << Cost << "\n"); return Cost; } @@ -550,7 +558,7 @@ bool RegBankSelect::assignInstr(MachineInstr &MI) { // Make sure the mapping is valid for MI. assert(BestMapping.verify(MI) && "Invalid instruction mapping"); - DEBUG(dbgs() << "Mapping: " << BestMapping << '\n'); + DEBUG(dbgs() << "Best Mapping: " << BestMapping << '\n'); // After this call, MI may not be valid anymore. // Do not use it. @@ -959,3 +967,20 @@ bool RegBankSelect::MappingCost::operator==(const MappingCost &Cost) const { return LocalCost == Cost.LocalCost && NonLocalCost == Cost.NonLocalCost && LocalFreq == Cost.LocalFreq; } + +void RegBankSelect::MappingCost::dump() const { + print(dbgs()); + dbgs() << '\n'; +} + +void RegBankSelect::MappingCost::print(raw_ostream &OS) const { + if (*this == ImpossibleCost()) { + OS << "impossible"; + return; + } + if (isSaturated()) { + OS << "saturated"; + return; + } + OS << LocalFreq << " * " << LocalCost << " + " << NonLocalCost; +} diff --git a/lib/CodeGen/GlobalISel/RegisterBank.cpp b/lib/CodeGen/GlobalISel/RegisterBank.cpp index 0ffc08188ead..49d676f11da6 100644 --- a/lib/CodeGen/GlobalISel/RegisterBank.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBank.cpp @@ -19,12 +19,15 @@ using namespace llvm; const unsigned RegisterBank::InvalidID = UINT_MAX; -RegisterBank::RegisterBank() : ID(InvalidID), Name(nullptr), Size(0) {} +RegisterBank::RegisterBank(unsigned ID, const char *Name, unsigned Size, + const uint32_t *CoveredClasses) + : ID(ID), Name(Name), Size(Size) { + ContainedRegClasses.resize(200); + ContainedRegClasses.setBitsInMask(CoveredClasses); +} bool RegisterBank::verify(const TargetRegisterInfo &TRI) const { assert(isValid() && "Invalid register bank"); - assert(ContainedRegClasses.size() == TRI.getNumRegClasses() && - "TRI does not match the initialization process?"); for (unsigned RCId = 0, End = TRI.getNumRegClasses(); RCId != End; ++RCId) { const TargetRegisterClass &RC = *TRI.getRegClass(RCId); diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index 7d405dd92ac3..da5ab0b9fb7b 100644 --- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -56,8 +56,10 @@ RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks) : RegBanks(RegBanks), NumRegBanks(NumRegBanks) { #ifndef NDEBUG - for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) + for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { assert(RegBanks[Idx] != nullptr && "Invalid RegisterBank"); + assert(RegBanks[Idx]->isValid() && "RegisterBank should be valid"); + } #endif // NDEBUG } @@ -74,116 +76,13 @@ bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const { const RegisterBank &RegBank = getRegBank(Idx); assert(Idx == RegBank.getID() && "ID does not match the index in the array"); - dbgs() << "Verify " << RegBank << '\n'; + DEBUG(dbgs() << "Verify " << RegBank << '\n'); assert(RegBank.verify(TRI) && "RegBank is invalid"); } #endif // NDEBUG return true; } -void RegisterBankInfo::createRegisterBank(unsigned ID, const char *Name) { - DEBUG(dbgs() << "Create register bank: " << ID << 
" with name \"" << Name - << "\"\n"); - RegisterBank &RegBank = getRegBank(ID); - assert(RegBank.getID() == RegisterBank::InvalidID && - "A register bank should be created only once"); - RegBank.ID = ID; - RegBank.Name = Name; -} - -void RegisterBankInfo::addRegBankCoverage(unsigned ID, unsigned RCId, - const TargetRegisterInfo &TRI) { - RegisterBank &RB = getRegBank(ID); - unsigned NbOfRegClasses = TRI.getNumRegClasses(); - - DEBUG(dbgs() << "Add coverage for: " << RB << '\n'); - - // Check if RB is underconstruction. - if (!RB.isValid()) - RB.ContainedRegClasses.resize(NbOfRegClasses); - else if (RB.covers(*TRI.getRegClass(RCId))) - // If RB already covers this register class, there is nothing - // to do. - return; - - BitVector &Covered = RB.ContainedRegClasses; - SmallVector<unsigned, 8> WorkList; - - WorkList.push_back(RCId); - Covered.set(RCId); - - unsigned &MaxSize = RB.Size; - do { - unsigned RCId = WorkList.pop_back_val(); - - const TargetRegisterClass &CurRC = *TRI.getRegClass(RCId); - - DEBUG(dbgs() << "Examine: " << TRI.getRegClassName(&CurRC) - << "(Size*8: " << (CurRC.getSize() * 8) << ")\n"); - - // Remember the biggest size in bits. - MaxSize = std::max(MaxSize, CurRC.getSize() * 8); - - // Walk through all sub register classes and push them into the worklist. - bool First = true; - for (BitMaskClassIterator It(CurRC.getSubClassMask(), TRI); It.isValid(); - ++It) { - unsigned SubRCId = It.getID(); - if (!Covered.test(SubRCId)) { - if (First) - DEBUG(dbgs() << " Enqueue sub-class: "); - DEBUG(dbgs() << TRI.getRegClassName(TRI.getRegClass(SubRCId)) << ", "); - WorkList.push_back(SubRCId); - // Remember that we saw the sub class. - Covered.set(SubRCId); - First = false; - } - } - if (!First) - DEBUG(dbgs() << '\n'); - - // Push also all the register classes that can be accessed via a - // subreg index, i.e., its subreg-class (which is different than - // its subclass). - // - // Note: It would probably be faster to go the other way around - // and have this method add only super classes, since this - // information is available in a more efficient way. However, it - // feels less natural for the client of this APIs plus we will - // TableGen the whole bitset at some point, so compile time for - // the initialization is not very important. - First = true; - for (unsigned SubRCId = 0; SubRCId < NbOfRegClasses; ++SubRCId) { - if (Covered.test(SubRCId)) - continue; - bool Pushed = false; - const TargetRegisterClass *SubRC = TRI.getRegClass(SubRCId); - for (SuperRegClassIterator SuperRCIt(SubRC, &TRI); SuperRCIt.isValid(); - ++SuperRCIt) { - if (Pushed) - break; - for (BitMaskClassIterator It(SuperRCIt.getMask(), TRI); It.isValid(); - ++It) { - unsigned SuperRCId = It.getID(); - if (SuperRCId == RCId) { - if (First) - DEBUG(dbgs() << " Enqueue subreg-class: "); - DEBUG(dbgs() << TRI.getRegClassName(SubRC) << ", "); - WorkList.push_back(SubRCId); - // Remember that we saw the sub class. 
- Covered.set(SubRCId);
- Pushed = true;
- First = false;
- break;
- }
- }
- }
- }
- if (!First)
- DEBUG(dbgs() << '\n');
- } while (!WorkList.empty());
-}
-
const RegisterBank *
RegisterBankInfo::getRegBank(unsigned Reg, const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const {
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index d2ce001103df..2f2e3b3d8e9f 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -1840,7 +1840,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << "!\"" << DIV->getName() << '\"';
else
MO.print(OS, MST, TRI);
- } else if (TRI && (isInsertSubreg() || isRegSequence()) && MO.isImm()) {
+ } else if (TRI && (isInsertSubreg() || isRegSequence() ||
+ (isSubregToReg() && i == 3)) && MO.isImm()) {
OS << TRI->getSubRegIndexName(MO.getImm());
} else if (i == AsmDescOp && MO.isImm()) {
// Pretty print the inline asm operand descriptor.
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index 11af50fe577c..6d643457e9a9 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1715,7 +1715,8 @@ ValueTrackerResult ValueTracker::getNextSourceFromBitcast() {
// Bitcasts with more than one def are not supported.
if (Def->getDesc().getNumDefs() != 1)
return ValueTrackerResult();
- if (Def->getOperand(DefIdx).getSubReg() != DefSubReg)
+ const MachineOperand DefOp = Def->getOperand(DefIdx);
+ if (DefOp.getSubReg() != DefSubReg)
// If we look for a different subreg, it means we want a subreg of the src.
// Bail, as we do not support composing subregs yet.
return ValueTrackerResult();
@@ -1735,6 +1736,14 @@ ValueTrackerResult ValueTracker::getNextSourceFromBitcast() {
return ValueTrackerResult();
SrcIdx = OpIdx;
}
+
+ // Stop when any user of the bitcast is a SUBREG_TO_REG; replacing the
+ // bitcast with a COPY would break the assumed guarantees for the upper bits.
+ for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(DefOp.getReg())) {
+ if (UseMI.isSubregToReg())
+ return ValueTrackerResult();
+ }
+
const MachineOperand &Src = Def->getOperand(SrcIdx);
return ValueTrackerResult(Src.getReg(), Src.getSubReg());
}
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 1f0c3283ceb1..427d95268c74 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -310,19 +310,19 @@ void SUnit::biasCriticalPath() {
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-static void dumpSUIdentifier(const ScheduleDAG &DAG, const SUnit &SU) {
- if (&SU == &DAG.ExitSU)
- dbgs() << "ExitSU";
- else if (&SU == &DAG.EntrySU)
- dbgs() << "EntrySU";
+void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const {
+ if (this == &DAG->ExitSU)
+ OS << "ExitSU";
+ else if (this == &DAG->EntrySU)
+ OS << "EntrySU";
else
- dbgs() << "SU(" << SU.NodeNum << ")";
+ OS << "SU(" << NodeNum << ")";
}

/// SUnit - Scheduling unit. It's a wrapper around either a single SDNode or
/// a group of nodes flagged together.
void SUnit::dump(const ScheduleDAG *G) const {
- dumpSUIdentifier(*G, *this);
+ print(dbgs(), G);
dbgs() << ": ";
G->dumpNode(this);
}
@@ -352,7 +352,7 @@ void SUnit::dumpAll(const ScheduleDAG *G) const {
case SDep::Output: dbgs() << "out "; break;
case SDep::Order: dbgs() << "ord "; break;
}
- dumpSUIdentifier(*G, *I->getSUnit());
+ I->getSUnit()->print(dbgs(), G);
if (I->isArtificial())
dbgs() << " *";
dbgs() << ": Latency=" << I->getLatency();
@@ -372,7 +372,7 @@ void SUnit::dumpAll(const ScheduleDAG *G) const {
case SDep::Output: dbgs() << "out "; break;
case SDep::Order: dbgs() << "ord "; break;
}
- dumpSUIdentifier(*G, *I->getSUnit());
+ I->getSUnit()->print(dbgs(), G);
if (I->isArtificial())
dbgs() << " *";
dbgs() << ": Latency=" << I->getLatency();
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4632484055d2..680f62fa91bc 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5361,8 +5361,9 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
// fold (select false, X, Y) -> Y
return !N0C->isNullValue() ? N1 : N2;
}
- // fold (select C, 1, X) -> (or C, X)
- if (VT == MVT::i1 && isOneConstant(N1))
+ // fold (select X, X, Y) -> (or X, Y)
+ // fold (select X, 1, Y) -> (or X, Y)
+ if (VT == VT0 && VT == MVT::i1 && (N0 == N1 || isOneConstant(N1)))
return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2);

if (SDValue V = foldSelectOfConstants(N))
@@ -5380,16 +5381,9 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
AddToWorklist(NOTNode.getNode());
return DAG.getNode(ISD::OR, SDLoc(N), VT, NOTNode, N1);
}
- // fold (select C, X, 0) -> (and C, X)
- if (VT == MVT::i1 && isNullConstant(N2))
- return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, N1);
- // fold (select X, X, Y) -> (or X, Y)
- // fold (select X, 1, Y) -> (or X, Y)
- if (VT == MVT::i1 && (N0 == N1 || isOneConstant(N1)))
- return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2);
// fold (select X, Y, X) -> (and X, Y)
// fold (select X, Y, 0) -> (and X, Y)
- if (VT == MVT::i1 && (N0 == N2 || isNullConstant(N2)))
+ if (VT == VT0 && VT == MVT::i1 && (N0 == N2 || isNullConstant(N2)))
return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, N1);

// If we can fold this based on the true/false value, do so.
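The reshuffled i1 folds in visitSELECT (now guarded by VT == VT0 so they only fire when condition and result types agree) are plain boolean identities and can be checked exhaustively:

#include <cassert>

int main() {
  for (int X = 0; X <= 1; ++X)
    for (int Y = 0; Y <= 1; ++Y) {
      assert((X ? X : Y) == (X | Y)); // select(X, X, Y) -> or(X, Y)
      assert((X ? 1 : Y) == (X | Y)); // select(X, 1, Y) -> or(X, Y)
      assert((X ? Y : X) == (X & Y)); // select(X, Y, X) -> and(X, Y)
      assert((X ? Y : 0) == (X & Y)); // select(X, Y, 0) -> and(X, Y)
    }
  return 0;
}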
@@ -5470,7 +5464,6 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { } // select (xor Cond, 1), X, Y -> select Cond, Y, X - // select (xor Cond, 0), X, Y -> selext Cond, X, Y if (VT0 == MVT::i1) { if (N0->getOpcode() == ISD::XOR) { if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) { @@ -5478,9 +5471,6 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { if (C->isOne()) return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0, N2, N1); - else - return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), - Cond0, N1, N2); } } } @@ -8136,7 +8126,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if ((AllowFusion || HasFMAD) && Aggressive) { // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) if (N0.getOpcode() == PreferredFusedOpcode && - N0.getOperand(2).getOpcode() == ISD::FMUL) { + N0.getOperand(2).getOpcode() == ISD::FMUL && + N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8147,7 +8138,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x)) if (N1->getOpcode() == PreferredFusedOpcode && - N1.getOperand(2).getOpcode() == ISD::FMUL) { + N1.getOperand(2).getOpcode() == ISD::FMUL && + N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8379,7 +8371,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode && - N0.getOperand(2).getOpcode() == ISD::FMUL) { + N0.getOperand(2).getOpcode() == ISD::FMUL && + N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 3485e35e6f5d..b0028252836a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -330,8 +330,6 @@ SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec, // supported by the target. EVT VT = Tmp1.getValueType(); EVT EltVT = VT.getVectorElementType(); - EVT IdxVT = Tmp3.getValueType(); - EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); SDValue StackPtr = DAG.CreateStackTemporary(VT); int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); @@ -341,13 +339,8 @@ SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec, DAG.getEntryNode(), dl, Tmp1, StackPtr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI)); - // Truncate or zero extend offset to target pointer type. - Tmp3 = DAG.getZExtOrTrunc(Tmp3, dl, PtrVT); - // Add the offset to the index. - unsigned EltSize = EltVT.getSizeInBits()/8; - Tmp3 = DAG.getNode(ISD::MUL, dl, IdxVT, Tmp3, - DAG.getConstant(EltSize, dl, IdxVT)); - SDValue StackPtr2 = DAG.getNode(ISD::ADD, dl, IdxVT, Tmp3, StackPtr); + SDValue StackPtr2 = TLI.getVectorElementPointer(DAG, StackPtr, VT, Tmp3); + // Store the scalar value. Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2, MachinePointerInfo(), EltVT); // Load the updated vector. 
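The hasOneUse() guards added to the FMA combines above are a profitability fix: the fold only removes work when the intermediate fma and product have no other users; otherwise they must still be materialized and the rewrite adds an operation. The rewrite itself, modeled with the C library's fused multiply-add:

#include <cmath>

// (x*y + (u*v)) + z  ==>  fma(x, y, fma(u, v, z)); profitable only when the
// inner values feeding the fold have no additional users.
double fusedForm(double x, double y, double u, double v, double z) {
  return std::fma(x, y, std::fma(u, v, z));
}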
@@ -1209,20 +1202,16 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { } } + EVT VecVT = Vec.getValueType(); + if (!Ch.getNode()) { // Store the value to a temporary stack slot, then LOAD the returned part. - StackPtr = DAG.CreateStackTemporary(Vec.getValueType()); + StackPtr = DAG.CreateStackTemporary(VecVT); Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo()); } - // Add the offset to the index. - unsigned EltSize = Vec.getScalarValueSizeInBits() / 8; - Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx, - DAG.getConstant(EltSize, SDLoc(Vec), Idx.getValueType())); - - Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy(DAG.getDataLayout())); - StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr); + StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); SDValue NewLoad; @@ -1232,7 +1221,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { else NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr, MachinePointerInfo(), - Vec.getValueType().getVectorElementType()); + VecVT.getVectorElementType()); // Replace the chain going out of the store, by the one out of the load. DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1)); @@ -1256,8 +1245,8 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { SDLoc dl(Op); // Store the value to a temporary stack slot, then LOAD the returned part. - - SDValue StackPtr = DAG.CreateStackTemporary(Vec.getValueType()); + EVT VecVT = Vec.getValueType(); + SDValue StackPtr = DAG.CreateStackTemporary(VecVT); int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); @@ -1266,16 +1255,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo); // Then store the inserted part. - - // Add the offset to the index. - unsigned EltSize = Vec.getScalarValueSizeInBits() / 8; - - Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx, - DAG.getConstant(EltSize, SDLoc(Vec), Idx.getValueType())); - Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy(DAG.getDataLayout())); - - SDValue SubStackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, - StackPtr); + SDValue SubStackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); // Store the subvector. 
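Both through-stack expansions now obtain their address through TLI.getVectorElementPointer, which clamps a variable index before scaling it; a dynamic out-of-range index therefore selects some in-range element instead of addressing past the stack temporary. The extract path in scalar form (4 x float, illustrative):

#include <cstring>

float extractElt(const float (&Vec)[4], unsigned Idx) {
  float Stack[4];
  std::memcpy(Stack, Vec, sizeof Stack); // store the vector to a stack slot
  Idx &= 3;                              // power-of-two clamp, as in the patch
  return Stack[Idx];                     // load the selected element
}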
Ch = DAG.getStore(Ch, dl, Part, SubStackPtr, MachinePointerInfo()); diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 6b62f11f1240..dc436ce04514 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -57,8 +57,6 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::BSWAP: Res = PromoteIntRes_BSWAP(N); break; case ISD::BUILD_PAIR: Res = PromoteIntRes_BUILD_PAIR(N); break; case ISD::Constant: Res = PromoteIntRes_Constant(N); break; - case ISD::CONVERT_RNDSAT: - Res = PromoteIntRes_CONVERT_RNDSAT(N); break; case ISD::CTLZ_ZERO_UNDEF: case ISD::CTLZ: Res = PromoteIntRes_CTLZ(N); break; case ISD::CTPOP: Res = PromoteIntRes_CTPOP(N); break; @@ -354,18 +352,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) { return Result; } -SDValue DAGTypeLegalizer::PromoteIntRes_CONVERT_RNDSAT(SDNode *N) { - ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode(); - assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU || - CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU || - CvtCode == ISD::CVT_SF || CvtCode == ISD::CVT_UF) && - "can only promote integers"); - EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - return DAG.getConvertRndSat(OutVT, SDLoc(N), N->getOperand(0), - N->getOperand(1), N->getOperand(2), - N->getOperand(3), N->getOperand(4), CvtCode); -} - SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { // Zero extend to the promoted type and do the count there. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); @@ -512,7 +498,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { N->getIndex()}; SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other), N->getMemoryVT(), dl, Ops, - N->getMemOperand()); + N->getMemOperand()); // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); @@ -887,8 +873,6 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break; - case ISD::CONVERT_RNDSAT: - Res = PromoteIntOp_CONVERT_RNDSAT(N); break; case ISD::INSERT_VECTOR_ELT: Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);break; case ISD::SCALAR_TO_VECTOR: @@ -1068,18 +1052,6 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) { return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } -SDValue DAGTypeLegalizer::PromoteIntOp_CONVERT_RNDSAT(SDNode *N) { - ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode(); - assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU || - CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU || - CvtCode == ISD::CVT_FS || CvtCode == ISD::CVT_FU) && - "can only promote integer arguments"); - SDValue InOp = GetPromotedInteger(N->getOperand(0)); - return DAG.getConvertRndSat(N->getValueType(0), SDLoc(N), InOp, - N->getOperand(1), N->getOperand(2), - N->getOperand(3), N->getOperand(4), CvtCode); -} - SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo) { if (OpNo == 1) { diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 693f5e2120a7..cf19d75676cd 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -1021,22 +1021,6 @@ void DAGTypeLegalizer::GetPairElements(SDValue Pair, DAG.getIntPtrConstant(1, dl)); } -SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, EVT EltVT, - SDValue Index) { - SDLoc dl(Index); - // Make sure the index type is big enough to compute in. - Index = DAG.getZExtOrTrunc(Index, dl, TLI.getPointerTy(DAG.getDataLayout())); - - // Calculate the element offset and add it to the pointer. - unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size. - assert(EltSize * 8 == EltVT.getSizeInBits() && - "Converting bits to bytes lost precision"); - - Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index, - DAG.getConstant(EltSize, dl, Index.getValueType())); - return DAG.getNode(ISD::ADD, dl, Index.getValueType(), Index, VecPtr); -} - /// Build an integer with low bits Lo and high bits Hi. SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) { // Arbitrarily use dlHi for result SDLoc diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d1022af69477..ec55662d75c0 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -173,7 +173,6 @@ private: /// input operand is returned. 
SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo); - SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index); SDValue JoinIntegers(SDValue Lo, SDValue Hi); SDValue LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned); @@ -250,7 +249,6 @@ private: SDValue PromoteIntRes_BITREVERSE(SDNode *N); SDValue PromoteIntRes_BUILD_PAIR(SDNode *N); SDValue PromoteIntRes_Constant(SDNode *N); - SDValue PromoteIntRes_CONVERT_RNDSAT(SDNode *N); SDValue PromoteIntRes_CTLZ(SDNode *N); SDValue PromoteIntRes_CTPOP(SDNode *N); SDValue PromoteIntRes_CTTZ(SDNode *N); @@ -289,7 +287,6 @@ private: SDValue PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N); - SDValue PromoteIntOp_CONVERT_RNDSAT(SDNode *N); SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N); @@ -603,7 +600,6 @@ private: SDValue ScalarizeVecRes_BITCAST(SDNode *N); SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); - SDValue ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N); SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N); SDValue ScalarizeVecRes_FP_ROUND(SDNode *N); SDValue ScalarizeVecRes_FPOWI(SDNode *N); @@ -709,7 +705,6 @@ private: SDValue WidenVecRes_BITCAST(SDNode* N); SDValue WidenVecRes_BUILD_VECTOR(SDNode* N); SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N); - SDValue WidenVecRes_CONVERT_RNDSAT(SDNode* N); SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N); SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N); SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 57c179ac15b8..27a9ac337f25 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -51,7 +51,6 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break; case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break; case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; - case ISD::CONVERT_RNDSAT: R = ScalarizeVecRes_CONVERT_RNDSAT(N); break; case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break; case ISD::FP_ROUND_INREG: R = ScalarizeVecRes_InregOp(N); break; @@ -179,17 +178,6 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) { return InOp; } -SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) { - EVT NewVT = N->getValueType(0).getVectorElementType(); - SDValue Op0 = GetScalarizedVector(N->getOperand(0)); - return DAG.getConvertRndSat(NewVT, SDLoc(N), - Op0, DAG.getValueType(NewVT), - DAG.getValueType(Op0.getValueType()), - N->getOperand(3), - N->getOperand(4), - cast<CvtRndSatSDNode>(N)->getCvtCode()); -} - SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) { return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), N->getValueType(0).getVectorElementType(), @@ -621,7 +609,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::BITREVERSE: case ISD::BSWAP: - case ISD::CONVERT_RNDSAT: case ISD::CTLZ: case ISD::CTTZ: case ISD::CTLZ_ZERO_UNDEF: @@ -846,7 +833,6 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, GetSplitVector(Vec, Lo, Hi); EVT VecVT = Vec.getValueType(); - EVT VecElemVT = 
VecVT.getVectorElementType(); unsigned VecElems = VecVT.getVectorNumElements(); unsigned SubElems = SubVec.getValueType().getVectorNumElements(); @@ -872,7 +858,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo()); // Store the new subvector into the specified index. - SDValue SubVecPtr = GetVectorElementPointer(StackPtr, VecElemVT, Idx); + SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); Type *VecType = VecVT.getTypeForEVT(*DAG.getContext()); unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType); Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo()); @@ -1003,7 +989,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, // Store the new element. This may be larger than the vector element type, // so use a truncating store. - SDValue EltPtr = GetVectorElementPointer(StackPtr, EltVT, Idx); + SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); Type *VecType = VecVT.getTypeForEVT(*DAG.getContext()); unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType); Store = @@ -1236,18 +1222,6 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, if (N->getOpcode() == ISD::FP_ROUND) { Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1)); Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getOperand(1)); - } else if (N->getOpcode() == ISD::CONVERT_RNDSAT) { - SDValue DTyOpLo = DAG.getValueType(LoVT); - SDValue DTyOpHi = DAG.getValueType(HiVT); - SDValue STyOpLo = DAG.getValueType(Lo.getValueType()); - SDValue STyOpHi = DAG.getValueType(Hi.getValueType()); - SDValue RndOp = N->getOperand(3); - SDValue SatOp = N->getOperand(4); - ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode(); - Lo = DAG.getConvertRndSat(LoVT, dl, Lo, DTyOpLo, STyOpLo, RndOp, SatOp, - CvtCode); - Hi = DAG.getConvertRndSat(HiVT, dl, Hi, DTyOpHi, STyOpHi, RndOp, SatOp, - CvtCode); } else { Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi); @@ -1650,7 +1624,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo()); // Load back the required element. 
- StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx); + StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr, MachinePointerInfo(), EltVT); } @@ -2045,7 +2019,6 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::BITCAST: Res = WidenVecRes_BITCAST(N); break; case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break; - case ISD::CONVERT_RNDSAT: Res = WidenVecRes_CONVERT_RNDSAT(N); break; case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND_INREG: Res = WidenVecRes_InregOp(N); break; case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break; @@ -2693,86 +2666,6 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); } -SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { - SDLoc dl(N); - SDValue InOp = N->getOperand(0); - SDValue RndOp = N->getOperand(3); - SDValue SatOp = N->getOperand(4); - - EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - unsigned WidenNumElts = WidenVT.getVectorNumElements(); - - EVT InVT = InOp.getValueType(); - EVT InEltVT = InVT.getVectorElementType(); - EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts); - - SDValue DTyOp = DAG.getValueType(WidenVT); - SDValue STyOp = DAG.getValueType(InWidenVT); - ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode(); - - unsigned InVTNumElts = InVT.getVectorNumElements(); - if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) { - InOp = GetWidenedVector(InOp); - InVT = InOp.getValueType(); - InVTNumElts = InVT.getVectorNumElements(); - if (InVTNumElts == WidenNumElts) - return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, - SatOp, CvtCode); - } - - if (TLI.isTypeLegal(InWidenVT)) { - // Because the result and the input are different vector types, widening - // the result could create a legal type but widening the input might make - // it an illegal type that might lead to repeatedly splitting the input - // and then widening it. To avoid this, we widen the input only if - // it results in a legal type. - if (WidenNumElts % InVTNumElts == 0) { - // Widen the input and call convert on the widened input vector. - unsigned NumConcat = WidenNumElts/InVTNumElts; - SmallVector<SDValue, 16> Ops(NumConcat); - Ops[0] = InOp; - SDValue UndefVal = DAG.getUNDEF(InVT); - for (unsigned i = 1; i != NumConcat; ++i) - Ops[i] = UndefVal; - - InOp = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWidenVT, Ops); - return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, - SatOp, CvtCode); - } - - if (InVTNumElts % WidenNumElts == 0) { - // Extract the input and convert the shorten input vector. - InOp = DAG.getNode( - ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp, - DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); - return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, - SatOp, CvtCode); - } - } - - // Otherwise unroll into some nasty scalar code and rebuild the vector. 
- SmallVector<SDValue, 16> Ops(WidenNumElts); - EVT EltVT = WidenVT.getVectorElementType(); - DTyOp = DAG.getValueType(EltVT); - STyOp = DAG.getValueType(InEltVT); - - unsigned MinElts = std::min(InVTNumElts, WidenNumElts); - unsigned i; - for (i=0; i < MinElts; ++i) { - SDValue ExtVal = DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp, - DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); - Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp, - SatOp, CvtCode); - } - - SDValue UndefVal = DAG.getUNDEF(EltVT); - for (; i < WidenNumElts; ++i) - Ops[i] = UndefVal; - - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); -} - SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { EVT VT = N->getValueType(0); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 5cc806668b12..a058942c5689 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -15,10 +15,20 @@ #ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H #define LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/Support/Casting.h" +#include <cassert> +#include <string> +#include <vector> namespace llvm { + +class InstrItineraryData; + /// ScheduleDAGSDNodes - A ScheduleDAG for scheduling SDNode-based DAGs. /// /// Edges between SUnits are initially based on edges in the SelectionDAG, @@ -44,7 +54,7 @@ namespace llvm { explicit ScheduleDAGSDNodes(MachineFunction &mf); - ~ScheduleDAGSDNodes() override {} + ~ScheduleDAGSDNodes() override = default; /// Run - perform scheduling. 
/// @@ -131,6 +141,7 @@ namespace llvm { unsigned DefIdx; unsigned NodeNumDefs; MVT ValueType; + public: RegDefIter(const SUnit *SU, const ScheduleDAGSDNodes *SD); @@ -150,6 +161,7 @@ namespace llvm { } void Advance(); + private: void InitNodeNumDefs(); }; @@ -175,6 +187,7 @@ namespace llvm { void EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap, MachineBasicBlock::iterator InsertPos); }; -} -#endif +} // end namespace llvm + +#endif // LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b970dc0e5f5f..e225ba8703b7 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1104,7 +1104,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL, if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) == TargetLowering::TypePromoteInteger) { EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT); - APInt NewVal = Elt->getValue().zext(EltVT.getSizeInBits()); + APInt NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits()); Elt = ConstantInt::get(*getContext(), NewVal); } // In other cases the element type is illegal and needs to be expanded, for @@ -1130,7 +1130,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL, SmallVector<SDValue, 2> EltParts; for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) { EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits) - .trunc(ViaEltSizeInBits), DL, + .zextOrTrunc(ViaEltSizeInBits), DL, ViaEltVT, isT, isO)); } @@ -1629,31 +1629,6 @@ SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) { return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, MaskVec); } -SDValue SelectionDAG::getConvertRndSat(EVT VT, const SDLoc &dl, SDValue Val, - SDValue DTy, SDValue STy, SDValue Rnd, - SDValue Sat, ISD::CvtCode Code) { - // If the src and dest types are the same and the conversion is between - // integer types of the same sign or two floats, no conversion is necessary. 
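The getConstant changes above replace zext with zextOrTrunc because the target element width is not guaranteed to be strictly wider than the source; zext would assert when asked to shrink. A scalar model of the tolerant conversion:

#include <cstdint>

// Behaves like APInt::zextOrTrunc: truncate when narrowing, and leave an
// already-masked value unchanged when widening.
uint64_t zextOrTrunc(uint64_t V, unsigned ToBits) {
  return ToBits >= 64 ? V : (V & ((1ULL << ToBits) - 1));
}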
- if (DTy == STy && - (Code == ISD::CVT_UU || Code == ISD::CVT_SS || Code == ISD::CVT_FF)) - return Val; - - FoldingSetNodeID ID; - SDValue Ops[] = { Val, DTy, STy, Rnd, Sat }; - AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), Ops); - void* IP = nullptr; - if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) - return SDValue(E, 0); - - auto *N = - newSDNode<CvtRndSatSDNode>(VT, dl.getIROrder(), dl.getDebugLoc(), Code); - createOperands(N, Ops); - - CSEMap.InsertNode(N, IP); - InsertNode(N); - return SDValue(N, 0); -} - SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::Register, getVTList(VT), None); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a07bd8f83546..9ca646534e2b 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5211,39 +5211,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { setValue(&I, Res); return nullptr; } - case Intrinsic::convertff: - case Intrinsic::convertfsi: - case Intrinsic::convertfui: - case Intrinsic::convertsif: - case Intrinsic::convertuif: - case Intrinsic::convertss: - case Intrinsic::convertsu: - case Intrinsic::convertus: - case Intrinsic::convertuu: { - ISD::CvtCode Code = ISD::CVT_INVALID; - switch (Intrinsic) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::convertff: Code = ISD::CVT_FF; break; - case Intrinsic::convertfsi: Code = ISD::CVT_FS; break; - case Intrinsic::convertfui: Code = ISD::CVT_FU; break; - case Intrinsic::convertsif: Code = ISD::CVT_SF; break; - case Intrinsic::convertuif: Code = ISD::CVT_UF; break; - case Intrinsic::convertss: Code = ISD::CVT_SS; break; - case Intrinsic::convertsu: Code = ISD::CVT_SU; break; - case Intrinsic::convertus: Code = ISD::CVT_US; break; - case Intrinsic::convertuu: Code = ISD::CVT_UU; break; - } - EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); - const Value *Op1 = I.getArgOperand(0); - Res = DAG.getConvertRndSat(DestVT, sdl, getValue(Op1), - DAG.getValueType(DestVT), - DAG.getValueType(getValue(Op1).getValueType()), - getValue(I.getArgOperand(1)), - getValue(I.getArgOperand(2)), - Code); - setValue(&I, Res); - return nullptr; - } case Intrinsic::powi: setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG)); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 340088a5fc96..0faaad8a21b7 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -262,21 +262,6 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FP16_TO_FP: return "fp16_to_fp"; case ISD::FP_TO_FP16: return "fp_to_fp16"; - case ISD::CONVERT_RNDSAT: { - switch (cast<CvtRndSatSDNode>(this)->getCvtCode()) { - default: llvm_unreachable("Unknown cvt code!"); - case ISD::CVT_FF: return "cvt_ff"; - case ISD::CVT_FS: return "cvt_fs"; - case ISD::CVT_FU: return "cvt_fu"; - case ISD::CVT_SF: return "cvt_sf"; - case ISD::CVT_UF: return "cvt_uf"; - case ISD::CVT_SS: return "cvt_ss"; - case ISD::CVT_SU: return "cvt_su"; - case ISD::CVT_US: return "cvt_us"; - case ISD::CVT_UU: return "cvt_uu"; - } - } - // Control flow instructions case ISD::BR: return "br"; case ISD::BRIND: return "brind"; @@ -322,7 +307,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case 
ISD::CTTZ_ZERO_UNDEF: return "cttz_zero_undef"; case ISD::CTLZ: return "ctlz"; case ISD::CTLZ_ZERO_UNDEF: return "ctlz_zero_undef"; - + // Trampolines case ISD::INIT_TRAMPOLINE: return "init_trampoline"; case ISD::ADJUST_TRAMPOLINE: return "adjust_trampoline"; diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 591a37d600cc..690f0d2c8082 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3706,7 +3706,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST, return Result; } -SDValue +SDValue TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask, const SDLoc &DL, EVT DataVT, SelectionDAG &DAG, @@ -3738,6 +3738,49 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask, return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Increment); } +static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, + SDValue Idx, + EVT VecVT, + const SDLoc &dl) { + if (isa<ConstantSDNode>(Idx)) + return Idx; + + EVT IdxVT = Idx.getValueType(); + unsigned NElts = VecVT.getVectorNumElements(); + if (isPowerOf2_32(NElts)) { + APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(), + Log2_32(NElts)); + return DAG.getNode(ISD::AND, dl, IdxVT, Idx, + DAG.getConstant(Imm, dl, IdxVT)); + } + + return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, + DAG.getConstant(NElts - 1, dl, IdxVT)); +} + +SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG, + SDValue VecPtr, EVT VecVT, + SDValue Index) const { + SDLoc dl(Index); + // Make sure the index type is big enough to compute in. + Index = DAG.getZExtOrTrunc(Index, dl, getPointerTy(DAG.getDataLayout())); + + EVT EltVT = VecVT.getVectorElementType(); + + // Calculate the element offset and add it to the pointer. + unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size. + assert(EltSize * 8 == EltVT.getSizeInBits() && + "Converting bits to bytes lost precision"); + + Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl); + + EVT IdxVT = Index.getValueType(); + + Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index, + DAG.getConstant(EltSize, dl, IdxVT)); + return DAG.getNode(ISD::ADD, dl, IdxVT, Index, VecPtr); +} + //===----------------------------------------------------------------------===// // Implementation of Emulated TLS Model //===----------------------------------------------------------------------===// diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt index 221a8969965d..f9bff86b41c8 100644 --- a/lib/DebugInfo/CodeView/CMakeLists.txt +++ b/lib/DebugInfo/CodeView/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_library(LLVMDebugInfoCodeView CodeViewError.cpp CodeViewRecordIO.cpp CVSymbolVisitor.cpp + CVTypeDumper.cpp CVTypeVisitor.cpp EnumTables.cpp Line.cpp @@ -10,7 +11,9 @@ add_llvm_library(LLVMDebugInfoCodeView RecordSerialization.cpp SymbolRecordMapping.cpp SymbolDumper.cpp - TypeDumper.cpp + TypeDatabase.cpp + TypeDatabaseVisitor.cpp + TypeDumpVisitor.cpp TypeRecord.cpp TypeRecordMapping.cpp TypeSerializer.cpp diff --git a/lib/DebugInfo/CodeView/CVTypeDumper.cpp b/lib/DebugInfo/CodeView/CVTypeDumper.cpp new file mode 100644 index 000000000000..fcd239cce0dd --- /dev/null +++ b/lib/DebugInfo/CodeView/CVTypeDumper.cpp @@ -0,0 +1,73 @@ +//===-- CVTypeDumper.cpp - CodeView type info dumper ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
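clampDynamicVectorIndex above chooses an AND mask when the element count is a power of two and an unsigned minimum otherwise; getVectorElementPointer then scales the clamped index by the element size and adds the base. The same logic in scalar form (illustrative):

#include <algorithm>
#include <cstdint>

uint64_t vectorElementPointer(uint64_t VecPtr, unsigned NumElts,
                              unsigned EltSizeBytes, uint64_t Idx) {
  bool Pow2 = (NumElts & (NumElts - 1)) == 0;
  uint64_t Clamped = Pow2 ? (Idx & (NumElts - 1))                 // cheap mask
                          : std::min<uint64_t>(Idx, NumElts - 1); // umin
  return VecPtr + Clamped * EltSizeBytes;
}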
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/CVTypeDumper.h" +#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" +#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h" +#include "llvm/DebugInfo/CodeView/TypeDeserializer.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" +#include "llvm/DebugInfo/MSF/ByteStream.h" + +using namespace llvm; +using namespace llvm::codeview; + +Error CVTypeDumper::dump(const CVType &Record, TypeVisitorCallbacks &Dumper) { + TypeDatabaseVisitor DBV(TypeDB); + TypeDeserializer Deserializer; + TypeVisitorCallbackPipeline Pipeline; + Pipeline.addCallbackToPipeline(Deserializer); + Pipeline.addCallbackToPipeline(DBV); + Pipeline.addCallbackToPipeline(Dumper); + + CVTypeVisitor Visitor(Pipeline); + + CVType RecordCopy = Record; + if (auto EC = Visitor.visitTypeRecord(RecordCopy)) + return EC; + return Error::success(); +} + +Error CVTypeDumper::dump(const CVTypeArray &Types, + TypeVisitorCallbacks &Dumper) { + TypeDatabaseVisitor DBV(TypeDB); + TypeDeserializer Deserializer; + TypeVisitorCallbackPipeline Pipeline; + Pipeline.addCallbackToPipeline(Deserializer); + Pipeline.addCallbackToPipeline(DBV); + Pipeline.addCallbackToPipeline(Dumper); + + CVTypeVisitor Visitor(Pipeline); + + if (auto EC = Visitor.visitTypeStream(Types)) + return EC; + return Error::success(); +} + +Error CVTypeDumper::dump(ArrayRef<uint8_t> Data, TypeVisitorCallbacks &Dumper) { + msf::ByteStream Stream(Data); + CVTypeArray Types; + msf::StreamReader Reader(Stream); + if (auto EC = Reader.readArray(Types, Reader.getLength())) + return EC; + + return dump(Types, Dumper); +} + +void CVTypeDumper::printTypeIndex(ScopedPrinter &Printer, StringRef FieldName, + TypeIndex TI, TypeDatabase &DB) { + StringRef TypeName; + if (!TI.isNoneType()) + TypeName = DB.getTypeName(TI); + if (!TypeName.empty()) + Printer.printHex(FieldName, TypeName, TI.getIndex()); + else + Printer.printHex(FieldName, TI.getIndex()); +} diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp index 326e1f5add65..fd54fba13c76 100644 --- a/lib/DebugInfo/CodeView/SymbolDumper.cpp +++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp @@ -11,13 +11,13 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallString.h" #include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h" +#include "llvm/DebugInfo/CodeView/CVTypeDumper.h" #include "llvm/DebugInfo/CodeView/EnumTables.h" #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" #include "llvm/DebugInfo/CodeView/SymbolDumpDelegate.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h" #include "llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h" -#include "llvm/DebugInfo/CodeView/TypeDumper.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/Error.h" #include "llvm/Support/ScopedPrinter.h" @@ -32,9 +32,9 @@ namespace { /// the visitor out of SymbolDumper.h. 
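///
/// A minimal usage sketch for the CVTypeDumper entry points added above (the
/// names W and TypeBytes are illustrative, and the constructor signatures are
/// assumptions drawn from how this commit's callers use them, not guarantees):
/// build a TypeDatabase, wrap it in a CVTypeDumper, and hand dump() any
/// callback such as the TypeDumpVisitor introduced later in this commit.
///
///   TypeDatabase TypeDB;
///   CVTypeDumper CVTD(TypeDB);
///   TypeDumpVisitor TDV(TypeDB, &W, /*PrintRecordBytes=*/false);
///   if (auto EC = CVTD.dump(TypeBytes, TDV)) // deserialize, record names, print
///     return EC;
///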
class CVSymbolDumperImpl : public SymbolVisitorCallbacks { public: - CVSymbolDumperImpl(CVTypeDumper &CVTD, SymbolDumpDelegate *ObjDelegate, + CVSymbolDumperImpl(TypeDatabase &TypeDB, SymbolDumpDelegate *ObjDelegate, ScopedPrinter &W, bool PrintRecordBytes) - : CVTD(CVTD), ObjDelegate(ObjDelegate), W(W), + : TypeDB(TypeDB), ObjDelegate(ObjDelegate), W(W), PrintRecordBytes(PrintRecordBytes), InFunctionScope(false) {} /// CVSymbolVisitor overrides. @@ -51,8 +51,9 @@ private: void printLocalVariableAddrRange(const LocalVariableAddrRange &Range, uint32_t RelocationOffset); void printLocalVariableAddrGap(ArrayRef<LocalVariableAddrGap> Gaps); + void printTypeIndex(StringRef FieldName, TypeIndex TI); - CVTypeDumper &CVTD; + TypeDatabase &TypeDB; SymbolDumpDelegate *ObjDelegate; ScopedPrinter &W; @@ -80,6 +81,10 @@ void CVSymbolDumperImpl::printLocalVariableAddrGap( } } +void CVSymbolDumperImpl::printTypeIndex(StringRef FieldName, TypeIndex TI) { + CVTypeDumper::printTypeIndex(W, FieldName, TI, TypeDB); +} + Error CVSymbolDumperImpl::visitSymbolBegin(CVSymbol &CVR) { return Error::success(); } @@ -163,7 +168,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, DictScope S(W, "BPRelativeSym"); W.printNumber("Offset", BPRel.Offset); - CVTD.printTypeIndex("Type", BPRel.Type); + printTypeIndex("Type", BPRel.Type); W.printString("VarName", BPRel.Name); return Error::success(); } @@ -187,7 +192,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, CallSiteInfo.CodeOffset, &LinkageName); } W.printHex("Segment", CallSiteInfo.Segment); - CVTD.printTypeIndex("Type", CallSiteInfo.Type); + printTypeIndex("Type", CallSiteInfo.Type); if (!LinkageName.empty()) W.printString("LinkageName", LinkageName); return Error::success(); @@ -278,7 +283,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ConstantSym &Constant) { DictScope S(W, "Constant"); - CVTD.printTypeIndex("Type", Constant.Type); + printTypeIndex("Type", Constant.Type); W.printNumber("Value", Constant.Value); W.printString("Name", Constant.Name); return Error::success(); @@ -293,7 +298,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, DataSym &Data) { ObjDelegate->printRelocatedField("DataOffset", Data.getRelocationOffset(), Data.DataOffset, &LinkageName); } - CVTD.printTypeIndex("Type", Data.Type); + printTypeIndex("Type", Data.Type); W.printString("DisplayName", Data.Name); if (!LinkageName.empty()) W.printString("LinkageName", LinkageName); @@ -445,7 +450,7 @@ Error CVSymbolDumperImpl::visitKnownRecord( } W.printHex("Segment", HeapAllocSite.Segment); W.printHex("CallInstructionSize", HeapAllocSite.CallInstructionSize); - CVTD.printTypeIndex("Type", HeapAllocSite.Type); + printTypeIndex("Type", HeapAllocSite.Type); if (!LinkageName.empty()) W.printString("LinkageName", LinkageName); return Error::success(); @@ -457,7 +462,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, W.printHex("PtrParent", InlineSite.Parent); W.printHex("PtrEnd", InlineSite.End); - CVTD.printTypeIndex("Inlinee", InlineSite.Inlinee); + printTypeIndex("Inlinee", InlineSite.Inlinee); ListScope BinaryAnnotations(W, "BinaryAnnotations"); for (auto &Annotation : InlineSite.annotations()) { @@ -555,7 +560,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, LabelSym &Label) { Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, LocalSym &Local) { DictScope S(W, "Local"); - CVTD.printTypeIndex("Type", Local.Type); + printTypeIndex("Type", Local.Type); W.printFlags("Flags", uint16_t(Local.Flags), getLocalFlagNames()); 
W.printString("VarName", Local.Name); return Error::success(); @@ -586,7 +591,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ProcSym &Proc) { W.printHex("CodeSize", Proc.CodeSize); W.printHex("DbgStart", Proc.DbgStart); W.printHex("DbgEnd", Proc.DbgEnd); - CVTD.printTypeIndex("FunctionType", Proc.FunctionType); + printTypeIndex("FunctionType", Proc.FunctionType); if (ObjDelegate) { ObjDelegate->printRelocatedField("CodeOffset", Proc.getRelocationOffset(), Proc.CodeOffset, &LinkageName); @@ -616,7 +621,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, CallerSym &Caller) { ListScope S(W, CVR.kind() == S_CALLEES ? "Callees" : "Callers"); for (auto FuncID : Caller.Indices) - CVTD.printTypeIndex("FuncID", FuncID); + printTypeIndex("FuncID", FuncID); return Error::success(); } @@ -625,7 +630,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, DictScope S(W, "RegRelativeSym"); W.printHex("Offset", RegRel.Offset); - CVTD.printTypeIndex("Type", RegRel.Type); + printTypeIndex("Type", RegRel.Type); W.printHex("Register", RegRel.Register); W.printString("VarName", RegRel.Name); return Error::success(); @@ -640,7 +645,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ObjDelegate->printRelocatedField("DataOffset", Data.getRelocationOffset(), Data.DataOffset, &LinkageName); } - CVTD.printTypeIndex("Type", Data.Type); + printTypeIndex("Type", Data.Type); W.printString("DisplayName", Data.Name); if (!LinkageName.empty()) W.printString("LinkageName", LinkageName); @@ -649,7 +654,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, UDTSym &UDT) { DictScope S(W, "UDT"); - CVTD.printTypeIndex("Type", UDT.Type); + printTypeIndex("Type", UDT.Type); W.printString("UDTName", UDT.Name); return Error::success(); } @@ -664,7 +669,7 @@ Error CVSymbolDumperImpl::visitUnknownSymbol(CVSymbol &CVR) { Error CVSymbolDumper::dump(CVRecord<SymbolKind> &Record) { SymbolVisitorCallbackPipeline Pipeline; SymbolDeserializer Deserializer(ObjDelegate.get()); - CVSymbolDumperImpl Dumper(CVTD, ObjDelegate.get(), W, PrintRecordBytes); + CVSymbolDumperImpl Dumper(TypeDB, ObjDelegate.get(), W, PrintRecordBytes); Pipeline.addCallbackToPipeline(Deserializer); Pipeline.addCallbackToPipeline(Dumper); @@ -675,7 +680,7 @@ Error CVSymbolDumper::dump(CVRecord<SymbolKind> &Record) { Error CVSymbolDumper::dump(const CVSymbolArray &Symbols) { SymbolVisitorCallbackPipeline Pipeline; SymbolDeserializer Deserializer(ObjDelegate.get()); - CVSymbolDumperImpl Dumper(CVTD, ObjDelegate.get(), W, PrintRecordBytes); + CVSymbolDumperImpl Dumper(TypeDB, ObjDelegate.get(), W, PrintRecordBytes); Pipeline.addCallbackToPipeline(Deserializer); Pipeline.addCallbackToPipeline(Dumper); diff --git a/lib/DebugInfo/CodeView/TypeDatabase.cpp b/lib/DebugInfo/CodeView/TypeDatabase.cpp new file mode 100644 index 000000000000..c7f72551dc8b --- /dev/null +++ b/lib/DebugInfo/CodeView/TypeDatabase.cpp @@ -0,0 +1,114 @@ +//===- TypeDatabase.cpp --------------------------------------- *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" + +using namespace llvm; +using namespace llvm::codeview; + +namespace { +struct SimpleTypeEntry { + StringRef Name; + SimpleTypeKind Kind; +}; +} + +/// The names here all end in "*". If the simple type is a pointer type, we +/// return the whole name. Otherwise we lop off the last character in our +/// StringRef. +static const SimpleTypeEntry SimpleTypeNames[] = { + {"void*", SimpleTypeKind::Void}, + {"<not translated>*", SimpleTypeKind::NotTranslated}, + {"HRESULT*", SimpleTypeKind::HResult}, + {"signed char*", SimpleTypeKind::SignedCharacter}, + {"unsigned char*", SimpleTypeKind::UnsignedCharacter}, + {"char*", SimpleTypeKind::NarrowCharacter}, + {"wchar_t*", SimpleTypeKind::WideCharacter}, + {"char16_t*", SimpleTypeKind::Character16}, + {"char32_t*", SimpleTypeKind::Character32}, + {"__int8*", SimpleTypeKind::SByte}, + {"unsigned __int8*", SimpleTypeKind::Byte}, + {"short*", SimpleTypeKind::Int16Short}, + {"unsigned short*", SimpleTypeKind::UInt16Short}, + {"__int16*", SimpleTypeKind::Int16}, + {"unsigned __int16*", SimpleTypeKind::UInt16}, + {"long*", SimpleTypeKind::Int32Long}, + {"unsigned long*", SimpleTypeKind::UInt32Long}, + {"int*", SimpleTypeKind::Int32}, + {"unsigned*", SimpleTypeKind::UInt32}, + {"__int64*", SimpleTypeKind::Int64Quad}, + {"unsigned __int64*", SimpleTypeKind::UInt64Quad}, + {"__int64*", SimpleTypeKind::Int64}, + {"unsigned __int64*", SimpleTypeKind::UInt64}, + {"__int128*", SimpleTypeKind::Int128}, + {"unsigned __int128*", SimpleTypeKind::UInt128}, + {"__half*", SimpleTypeKind::Float16}, + {"float*", SimpleTypeKind::Float32}, + {"float*", SimpleTypeKind::Float32PartialPrecision}, + {"__float48*", SimpleTypeKind::Float48}, + {"double*", SimpleTypeKind::Float64}, + {"long double*", SimpleTypeKind::Float80}, + {"__float128*", SimpleTypeKind::Float128}, + {"_Complex float*", SimpleTypeKind::Complex32}, + {"_Complex double*", SimpleTypeKind::Complex64}, + {"_Complex long double*", SimpleTypeKind::Complex80}, + {"_Complex __float128*", SimpleTypeKind::Complex128}, + {"bool*", SimpleTypeKind::Boolean8}, + {"__bool16*", SimpleTypeKind::Boolean16}, + {"__bool32*", SimpleTypeKind::Boolean32}, + {"__bool64*", SimpleTypeKind::Boolean64}, +}; + +/// Gets the type index for the next type record. +TypeIndex TypeDatabase::getNextTypeIndex() const { + return TypeIndex(TypeIndex::FirstNonSimpleIndex + CVUDTNames.size()); +} + +/// Records the name of a type, and reserves its type index. +void TypeDatabase::recordType(StringRef Name, CVType Data) { + CVUDTNames.push_back(Name); + TypeRecords.push_back(Data); +} + +/// Saves the name in a StringSet and creates a stable StringRef. +StringRef TypeDatabase::saveTypeName(StringRef TypeName) { + return TypeNameStorage.save(TypeName); +} + +StringRef TypeDatabase::getTypeName(TypeIndex Index) const { + if (Index.isNoneType()) + return "<no type>"; + + if (Index.isSimple()) { + // This is a simple type. + for (const auto &SimpleTypeName : SimpleTypeNames) { + if (SimpleTypeName.Kind == Index.getSimpleKind()) { + if (Index.getSimpleMode() == SimpleTypeMode::Direct) + return SimpleTypeName.Name.drop_back(1); + // Otherwise, this is a pointer type. We gloss over the distinction + // between near, far, 64, 32, etc, and just give a pointer type. 
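// A worked example of the convention described above (an illustration, not
// part of the patch): for TypeIndex(SimpleTypeKind::Int32) in
// SimpleTypeMode::Direct the stored entry "int*" loses its trailing '*' and
// comes back as "int"; in any pointer mode (NearPointer, Pointer64, ...) the
// stored "int*" is returned unchanged.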
+ return SimpleTypeName.Name; + } + } + return "<unknown simple type>"; + } + + uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex; + if (I < CVUDTNames.size()) + return CVUDTNames[I]; + + return "<unknown UDT>"; +} + +bool TypeDatabase::containsTypeIndex(TypeIndex Index) const { + uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex; + return I < CVUDTNames.size(); +} + +uint32_t TypeDatabase::size() const { return CVUDTNames.size(); } diff --git a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp new file mode 100644 index 000000000000..d9d563902182 --- /dev/null +++ b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp @@ -0,0 +1,289 @@ +//===- TypeDatabaseVisitor.cpp -------------------------------- *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h" + +#include "llvm/ADT/SmallString.h" + +using namespace llvm; + +using namespace llvm::codeview; + +Error TypeDatabaseVisitor::visitTypeBegin(CVRecord<TypeLeafKind> &Record) { + assert(!IsInFieldList); + // Reset Name to the empty string. If the visitor sets it, we know it. + Name = ""; + + if (Record.Type == LF_FIELDLIST) { + // Record that we're in a field list so that members do not get assigned + // type indices. + IsInFieldList = true; + } + return Error::success(); +} + +Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) { + if (CVR.Type == LF_FIELDLIST) { + assert(IsInFieldList); + IsInFieldList = false; + } + assert(!IsInFieldList); + + // Record every type that is not a field list member, even if Name is empty. + // CVUDTNames is indexed by type index, and must have one entry for every + // type. Field list members are not recorded, and are only referenced by + // their containing field list record. + TypeDB.recordType(Name, CVR); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitMemberBegin(CVMemberRecord &Record) { + assert(IsInFieldList); + // Reset Name to the empty string. If the visitor sets it, we know it. + Name = ""; + return Error::success(); +} + +Error TypeDatabaseVisitor::visitMemberEnd(CVMemberRecord &Record) { + assert(IsInFieldList); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + FieldListRecord &FieldList) { + Name = "<field list>"; + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, + StringIdRecord &String) { + // Put this in the database so it gets printed with LF_UDT_SRC_LINE. 
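// The Name protocol used by every overload in this file: visitTypeBegin()
// clears Name, each visitKnownRecord() overload may set it, and
// visitTypeEnd() hands it to TypeDB.recordType(), so the N-th top-level
// record always lands at type index 0x1000 + N, which is exactly what
// TypeDatabase::getNextTypeIndex() computes.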
+ Name = String.getString(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) { + auto Indices = Args.getIndices(); + uint32_t Size = Indices.size(); + SmallString<256> TypeName("("); + for (uint32_t I = 0; I < Size; ++I) { + StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]); + TypeName.append(ArgTypeName); + if (I + 1 != Size) + TypeName.append(", "); + } + TypeName.push_back(')'); + Name = TypeDB.saveTypeName(TypeName); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ClassRecord &Class) { + Name = Class.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, UnionRecord &Union) { + Name = Union.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, EnumRecord &Enum) { + Name = Enum.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArrayRecord &AT) { + Name = AT.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, VFTableRecord &VFT) { + Name = VFT.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + MemberFuncIdRecord &Id) { + Name = Id.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + ProcedureRecord &Proc) { + StringRef ReturnTypeName = TypeDB.getTypeName(Proc.getReturnType()); + StringRef ArgListTypeName = TypeDB.getTypeName(Proc.getArgumentList()); + SmallString<256> TypeName(ReturnTypeName); + TypeName.push_back(' '); + TypeName.append(ArgListTypeName); + Name = TypeDB.saveTypeName(TypeName); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + MemberFunctionRecord &MF) { + StringRef ReturnTypeName = TypeDB.getTypeName(MF.getReturnType()); + StringRef ClassTypeName = TypeDB.getTypeName(MF.getClassType()); + StringRef ArgListTypeName = TypeDB.getTypeName(MF.getArgumentList()); + SmallString<256> TypeName(ReturnTypeName); + TypeName.push_back(' '); + TypeName.append(ClassTypeName); + TypeName.append("::"); + TypeName.append(ArgListTypeName); + Name = TypeDB.saveTypeName(TypeName); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) { + Name = Func.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + TypeServer2Record &TS) { + Name = TS.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { + + if (Ptr.isPointerToMember()) { + const MemberPointerInfo &MI = Ptr.getMemberInfo(); + + StringRef PointeeName = TypeDB.getTypeName(Ptr.getReferentType()); + StringRef ClassName = TypeDB.getTypeName(MI.getContainingType()); + SmallString<256> TypeName(PointeeName); + TypeName.push_back(' '); + TypeName.append(ClassName); + TypeName.append("::*"); + Name = TypeDB.saveTypeName(TypeName); + } else { + SmallString<256> TypeName; + if (Ptr.isConst()) + TypeName.append("const "); + if (Ptr.isVolatile()) + TypeName.append("volatile "); + if (Ptr.isUnaligned()) + TypeName.append("__unaligned "); + + TypeName.append(TypeDB.getTypeName(Ptr.getReferentType())); + + if (Ptr.getMode() == PointerMode::LValueReference) + TypeName.append("&"); + else if (Ptr.getMode() == PointerMode::RValueReference) + TypeName.append("&&"); + else if (Ptr.getMode() == PointerMode::Pointer) + TypeName.append("*"); + + if 
(!TypeName.empty()) + Name = TypeDB.saveTypeName(TypeName); + } + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) { + uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers()); + + StringRef ModifiedName = TypeDB.getTypeName(Mod.getModifiedType()); + SmallString<256> TypeName; + if (Mods & uint16_t(ModifierOptions::Const)) + TypeName.append("const "); + if (Mods & uint16_t(ModifierOptions::Volatile)) + TypeName.append("volatile "); + if (Mods & uint16_t(ModifierOptions::Unaligned)) + TypeName.append("__unaligned "); + TypeName.append(ModifiedName); + Name = TypeDB.saveTypeName(TypeName); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + VFTableShapeRecord &Shape) { + Name = TypeDB.saveTypeName("<vftable " + utostr(Shape.getEntryCount()) + + " methods>"); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + NestedTypeRecord &Nested) { + Name = Nested.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + OneMethodRecord &Method) { + Name = Method.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + OverloadedMethodRecord &Method) { + Name = Method.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + DataMemberRecord &Field) { + Name = Field.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + StaticDataMemberRecord &Field) { + Name = Field.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + EnumeratorRecord &Enum) { + Name = Enum.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + BaseClassRecord &Base) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + VirtualBaseClassRecord &VBase) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + ListContinuationRecord &Cont) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord( + CVType &CVR, UdtModSourceLineRecord &ModSourceLine) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + UdtSourceLineRecord &SourceLine) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, BitFieldRecord &BF) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord( + CVType &CVR, MethodOverloadListRecord &Overloads) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, BuildInfoRecord &BI) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + VFPtrRecord &VFP) { + return Error::success(); +} diff --git a/lib/DebugInfo/CodeView/TypeDumper.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index 4274d834076a..033585ba8cc9 100644 --- a/lib/DebugInfo/CodeView/TypeDumper.cpp +++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp @@ -1,4 +1,5 @@ -//===-- TypeDumper.cpp - CodeView type info dumper --------------*- C++ -*-===// +//===-- TypeDumpVisitor.cpp - CodeView type info dumper -----------*- C++ +//-*-===// // // The LLVM Compiler Infrastructure // @@ -7,9 +8,13 @@ // //===----------------------------------------------------------------------===// -#include 
"llvm/DebugInfo/CodeView/TypeDumper.h" +#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h" + #include "llvm/ADT/SmallString.h" +#include "llvm/DebugInfo/CodeView/CVTypeDumper.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" +#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" @@ -20,52 +25,6 @@ using namespace llvm; using namespace llvm::codeview; -/// The names here all end in "*". If the simple type is a pointer type, we -/// return the whole name. Otherwise we lop off the last character in our -/// StringRef. -static const EnumEntry<SimpleTypeKind> SimpleTypeNames[] = { - {"void*", SimpleTypeKind::Void}, - {"<not translated>*", SimpleTypeKind::NotTranslated}, - {"HRESULT*", SimpleTypeKind::HResult}, - {"signed char*", SimpleTypeKind::SignedCharacter}, - {"unsigned char*", SimpleTypeKind::UnsignedCharacter}, - {"char*", SimpleTypeKind::NarrowCharacter}, - {"wchar_t*", SimpleTypeKind::WideCharacter}, - {"char16_t*", SimpleTypeKind::Character16}, - {"char32_t*", SimpleTypeKind::Character32}, - {"__int8*", SimpleTypeKind::SByte}, - {"unsigned __int8*", SimpleTypeKind::Byte}, - {"short*", SimpleTypeKind::Int16Short}, - {"unsigned short*", SimpleTypeKind::UInt16Short}, - {"__int16*", SimpleTypeKind::Int16}, - {"unsigned __int16*", SimpleTypeKind::UInt16}, - {"long*", SimpleTypeKind::Int32Long}, - {"unsigned long*", SimpleTypeKind::UInt32Long}, - {"int*", SimpleTypeKind::Int32}, - {"unsigned*", SimpleTypeKind::UInt32}, - {"__int64*", SimpleTypeKind::Int64Quad}, - {"unsigned __int64*", SimpleTypeKind::UInt64Quad}, - {"__int64*", SimpleTypeKind::Int64}, - {"unsigned __int64*", SimpleTypeKind::UInt64}, - {"__int128*", SimpleTypeKind::Int128}, - {"unsigned __int128*", SimpleTypeKind::UInt128}, - {"__half*", SimpleTypeKind::Float16}, - {"float*", SimpleTypeKind::Float32}, - {"float*", SimpleTypeKind::Float32PartialPrecision}, - {"__float48*", SimpleTypeKind::Float48}, - {"double*", SimpleTypeKind::Float64}, - {"long double*", SimpleTypeKind::Float80}, - {"__float128*", SimpleTypeKind::Float128}, - {"_Complex float*", SimpleTypeKind::Complex32}, - {"_Complex double*", SimpleTypeKind::Complex64}, - {"_Complex long double*", SimpleTypeKind::Complex80}, - {"_Complex __float128*", SimpleTypeKind::Complex128}, - {"bool*", SimpleTypeKind::Boolean8}, - {"__bool16*", SimpleTypeKind::Boolean16}, - {"__bool32*", SimpleTypeKind::Boolean32}, - {"__bool64*", SimpleTypeKind::Boolean64}, -}; - static const EnumEntry<TypeLeafKind> LeafTypeNames[] = { #define CV_TYPE(enum, val) {#enum, enum}, #include "llvm/DebugInfo/CodeView/TypeRecords.def" @@ -90,10 +49,8 @@ static const EnumEntry<uint16_t> ClassOptionNames[] = { }; static const EnumEntry<uint8_t> MemberAccessNames[] = { - ENUM_ENTRY(MemberAccess, None), - ENUM_ENTRY(MemberAccess, Private), - ENUM_ENTRY(MemberAccess, Protected), - ENUM_ENTRY(MemberAccess, Public), + ENUM_ENTRY(MemberAccess, None), ENUM_ENTRY(MemberAccess, Private), + ENUM_ENTRY(MemberAccess, Protected), ENUM_ENTRY(MemberAccess, Public), }; static const EnumEntry<uint16_t> MethodOptionNames[] = { @@ -151,8 +108,7 @@ static const EnumEntry<uint16_t> PtrMemberRepNames[] = { }; static const EnumEntry<uint16_t> TypeModifierNames[] = { - ENUM_ENTRY(ModifierOptions, Const), - ENUM_ENTRY(ModifierOptions, Volatile), + ENUM_ENTRY(ModifierOptions, Const), ENUM_ENTRY(ModifierOptions, Volatile), 
ENUM_ENTRY(ModifierOptions, Unaligned), }; @@ -203,38 +159,22 @@ static StringRef getLeafTypeName(TypeLeafKind LT) { return "UnknownLeaf"; } -Error CVTypeDumper::visitTypeBegin(CVRecord<TypeLeafKind> &Record) { - assert(!IsInFieldList); - // Reset Name to the empty string. If the visitor sets it, we know it. - Name = ""; +void TypeDumpVisitor::printTypeIndex(StringRef FieldName, TypeIndex TI) const { + CVTypeDumper::printTypeIndex(*W, FieldName, TI, TypeDB); +} +Error TypeDumpVisitor::visitTypeBegin(CVType &Record) { W->startLine() << getLeafTypeName(Record.Type); - W->getOStream() << " (" << HexNumber(getNextTypeIndex()) << ")"; + W->getOStream() << " (" << HexNumber(TypeDB.getNextTypeIndex().getIndex()) + << ")"; W->getOStream() << " {\n"; W->indent(); W->printEnum("TypeLeafKind", unsigned(Record.Type), makeArrayRef(LeafTypeNames)); - if (Record.Type == LF_FIELDLIST) { - // Record that we're in a field list so that members do not get assigned - // type indices. - IsInFieldList = true; - } return Error::success(); } -Error CVTypeDumper::visitTypeEnd(CVRecord<TypeLeafKind> &Record) { - if (Record.Type == LF_FIELDLIST) { - assert(IsInFieldList); - IsInFieldList = false; - } - assert(!IsInFieldList); - - // Record every type that is not a field list member, even if Name is empty. - // CVUDTNames is indexed by type index, and must have one entry for every - // type. Field list members are not recorded, and are only referenced by - // their containing field list record. - recordType(Name); - +Error TypeDumpVisitor::visitTypeEnd(CVType &Record) { if (PrintRecordBytes) W->printBinaryBlock("LeafData", getBytesAsCharacters(Record.content())); @@ -243,11 +183,7 @@ Error CVTypeDumper::visitTypeEnd(CVRecord<TypeLeafKind> &Record) { return Error::success(); } -Error CVTypeDumper::visitMemberBegin(CVMemberRecord &Record) { - assert(IsInFieldList); - // Reset Name to the empty string. If the visitor sets it, we know it. - Name = ""; - +Error TypeDumpVisitor::visitMemberBegin(CVMemberRecord &Record) { W->startLine() << getLeafTypeName(Record.Kind); W->getOStream() << " {\n"; W->indent(); @@ -256,8 +192,7 @@ Error CVTypeDumper::visitMemberBegin(CVMemberRecord &Record) { return Error::success(); } -Error CVTypeDumper::visitMemberEnd(CVMemberRecord &Record) { - assert(IsInFieldList); +Error TypeDumpVisitor::visitMemberEnd(CVMemberRecord &Record) { if (PrintRecordBytes) W->printBinaryBlock("LeafData", getBytesAsCharacters(Record.Data)); @@ -266,46 +201,33 @@ Error CVTypeDumper::visitMemberEnd(CVMemberRecord &Record) { return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - FieldListRecord &FieldList) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, + FieldListRecord &FieldList) { CVTypeVisitor Visitor(*this); if (auto EC = Visitor.visitFieldListMemberStream(FieldList.Data)) return EC; - Name = "<field list>"; return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - StringIdRecord &String) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, StringIdRecord &String) { printTypeIndex("Id", String.getId()); W->printString("StringData", String.getString()); - // Put this in CVUDTNames so it gets printed with LF_UDT_SRC_LINE. 
- Name = String.getString(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - ArgListRecord &Args) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) { auto Indices = Args.getIndices(); uint32_t Size = Indices.size(); W->printNumber("NumArgs", Size); ListScope Arguments(*W, "Arguments"); - SmallString<256> TypeName("("); for (uint32_t I = 0; I < Size; ++I) { printTypeIndex("ArgType", Indices[I]); - StringRef ArgTypeName = getTypeName(Indices[I]); - TypeName.append(ArgTypeName); - if (I + 1 != Size) - TypeName.append(", "); } - TypeName.push_back(')'); - Name = saveName(TypeName); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - ClassRecord &Class) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ClassRecord &Class) { uint16_t Props = static_cast<uint16_t>(Class.getOptions()); W->printNumber("MemberCount", Class.getMemberCount()); W->printFlags("Properties", Props, makeArrayRef(ClassOptionNames)); @@ -316,12 +238,10 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, W->printString("Name", Class.getName()); if (Props & uint16_t(ClassOptions::HasUniqueName)) W->printString("LinkageName", Class.getUniqueName()); - Name = Class.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - UnionRecord &Union) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, UnionRecord &Union) { uint16_t Props = static_cast<uint16_t>(Union.getOptions()); W->printNumber("MemberCount", Union.getMemberCount()); W->printFlags("Properties", Props, makeArrayRef(ClassOptionNames)); @@ -330,12 +250,10 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, W->printString("Name", Union.getName()); if (Props & uint16_t(ClassOptions::HasUniqueName)) W->printString("LinkageName", Union.getUniqueName()); - Name = Union.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - EnumRecord &Enum) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, EnumRecord &Enum) { uint16_t Props = static_cast<uint16_t>(Enum.getOptions()); W->printNumber("NumEnumerators", Enum.getMemberCount()); W->printFlags("Properties", uint16_t(Enum.getOptions()), @@ -345,43 +263,35 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, W->printString("Name", Enum.getName()); if (Props & uint16_t(ClassOptions::HasUniqueName)) W->printString("LinkageName", Enum.getUniqueName()); - Name = Enum.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - ArrayRecord &AT) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ArrayRecord &AT) { printTypeIndex("ElementType", AT.getElementType()); printTypeIndex("IndexType", AT.getIndexType()); W->printNumber("SizeOf", AT.getSize()); W->printString("Name", AT.getName()); - Name = AT.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - VFTableRecord &VFT) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, VFTableRecord &VFT) { printTypeIndex("CompleteClass", VFT.getCompleteClass()); printTypeIndex("OverriddenVFTable", VFT.getOverriddenVTable()); W->printHex("VFPtrOffset", VFT.getVFPtrOffset()); W->printString("VFTableName", VFT.getName()); for (auto N : VFT.getMethodNames()) W->printString("MethodName", N); - Name = VFT.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - 
MemberFuncIdRecord &Id) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, MemberFuncIdRecord &Id) { printTypeIndex("ClassType", Id.getClassType()); printTypeIndex("FunctionType", Id.getFunctionType()); W->printString("Name", Id.getName()); - Name = Id.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - ProcedureRecord &Proc) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ProcedureRecord &Proc) { printTypeIndex("ReturnType", Proc.getReturnType()); W->printEnum("CallingConvention", uint8_t(Proc.getCallConv()), makeArrayRef(CallingConventions)); @@ -389,18 +299,10 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, makeArrayRef(FunctionOptionEnum)); W->printNumber("NumParameters", Proc.getParameterCount()); printTypeIndex("ArgListType", Proc.getArgumentList()); - - StringRef ReturnTypeName = getTypeName(Proc.getReturnType()); - StringRef ArgListTypeName = getTypeName(Proc.getArgumentList()); - SmallString<256> TypeName(ReturnTypeName); - TypeName.push_back(' '); - TypeName.append(ArgListTypeName); - Name = saveName(TypeName); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - MemberFunctionRecord &MF) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, MemberFunctionRecord &MF) { printTypeIndex("ReturnType", MF.getReturnType()); printTypeIndex("ClassType", MF.getClassType()); printTypeIndex("ThisType", MF.getThisType()); @@ -411,21 +313,11 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, W->printNumber("NumParameters", MF.getParameterCount()); printTypeIndex("ArgListType", MF.getArgumentList()); W->printNumber("ThisAdjustment", MF.getThisPointerAdjustment()); - - StringRef ReturnTypeName = getTypeName(MF.getReturnType()); - StringRef ClassTypeName = getTypeName(MF.getClassType()); - StringRef ArgListTypeName = getTypeName(MF.getArgumentList()); - SmallString<256> TypeName(ReturnTypeName); - TypeName.push_back(' '); - TypeName.append(ClassTypeName); - TypeName.append("::"); - TypeName.append(ArgListTypeName); - Name = saveName(TypeName); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - MethodOverloadListRecord &MethodList) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, + MethodOverloadListRecord &MethodList) { for (auto &M : MethodList.getMethods()) { ListScope S(*W, "Method"); printMemberAttributes(M.getAccess(), M.getMethodKind(), M.getOptions()); @@ -436,26 +328,21 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - FuncIdRecord &Func) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) { printTypeIndex("ParentScope", Func.getParentScope()); printTypeIndex("FunctionType", Func.getFunctionType()); W->printString("Name", Func.getName()); - Name = Func.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - TypeServer2Record &TS) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) { W->printBinary("Signature", TS.getGuid()); W->printNumber("Age", TS.getAge()); W->printString("Name", TS.getName()); - Name = TS.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - PointerRecord &Ptr) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { printTypeIndex("PointeeType", Ptr.getReferentType()); 
W->printHex("PointerAttributes", uint32_t(Ptr.getOptions())); W->printEnum("PtrType", unsigned(Ptr.getPointerKind()), @@ -474,82 +361,42 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, printTypeIndex("ClassType", MI.getContainingType()); W->printEnum("Representation", uint16_t(MI.getRepresentation()), makeArrayRef(PtrMemberRepNames)); - - StringRef PointeeName = getTypeName(Ptr.getReferentType()); - StringRef ClassName = getTypeName(MI.getContainingType()); - SmallString<256> TypeName(PointeeName); - TypeName.push_back(' '); - TypeName.append(ClassName); - TypeName.append("::*"); - Name = saveName(TypeName); - } else { - SmallString<256> TypeName; - if (Ptr.isConst()) - TypeName.append("const "); - if (Ptr.isVolatile()) - TypeName.append("volatile "); - if (Ptr.isUnaligned()) - TypeName.append("__unaligned "); - - TypeName.append(getTypeName(Ptr.getReferentType())); - - if (Ptr.getMode() == PointerMode::LValueReference) - TypeName.append("&"); - else if (Ptr.getMode() == PointerMode::RValueReference) - TypeName.append("&&"); - else if (Ptr.getMode() == PointerMode::Pointer) - TypeName.append("*"); - - if (!TypeName.empty()) - Name = saveName(TypeName); } + return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - ModifierRecord &Mod) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) { uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers()); printTypeIndex("ModifiedType", Mod.getModifiedType()); W->printFlags("Modifiers", Mods, makeArrayRef(TypeModifierNames)); - StringRef ModifiedName = getTypeName(Mod.getModifiedType()); - SmallString<256> TypeName; - if (Mods & uint16_t(ModifierOptions::Const)) - TypeName.append("const "); - if (Mods & uint16_t(ModifierOptions::Volatile)) - TypeName.append("volatile "); - if (Mods & uint16_t(ModifierOptions::Unaligned)) - TypeName.append("__unaligned "); - TypeName.append(ModifiedName); - Name = saveName(TypeName); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - BitFieldRecord &BitField) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, BitFieldRecord &BitField) { printTypeIndex("Type", BitField.getType()); W->printNumber("BitSize", BitField.getBitSize()); W->printNumber("BitOffset", BitField.getBitOffset()); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - VFTableShapeRecord &Shape) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, + VFTableShapeRecord &Shape) { W->printNumber("VFEntryCount", Shape.getEntryCount()); - Name = saveName("<vftable " + utostr(Shape.getEntryCount()) + " methods>"); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - UdtSourceLineRecord &Line) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, + UdtSourceLineRecord &Line) { printTypeIndex("UDT", Line.getUDT()); printTypeIndex("SourceFile", Line.getSourceFile()); W->printNumber("LineNumber", Line.getLineNumber()); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - UdtModSourceLineRecord &Line) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, + UdtModSourceLineRecord &Line) { printTypeIndex("UDT", Line.getUDT()); printTypeIndex("SourceFile", Line.getSourceFile()); W->printNumber("LineNumber", Line.getLineNumber()); @@ -557,8 +404,7 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> 
&CVR, - BuildInfoRecord &Args) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, BuildInfoRecord &Args) { W->printNumber("NumArgs", static_cast<uint32_t>(Args.getArgs().size())); ListScope Arguments(*W, "Arguments"); @@ -568,13 +414,14 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, return Error::success(); } -void CVTypeDumper::printMemberAttributes(MemberAttributes Attrs) { +void TypeDumpVisitor::printMemberAttributes(MemberAttributes Attrs) { return printMemberAttributes(Attrs.getAccess(), Attrs.getMethodKind(), Attrs.getFlags()); } -void CVTypeDumper::printMemberAttributes(MemberAccess Access, MethodKind Kind, - MethodOptions Options) { +void TypeDumpVisitor::printMemberAttributes(MemberAccess Access, + MethodKind Kind, + MethodOptions Options) { W->printEnum("AccessSpecifier", uint8_t(Access), makeArrayRef(MemberAccessNames)); // Data members will be vanilla. Don't try to print a method kind for them. @@ -586,27 +433,26 @@ void CVTypeDumper::printMemberAttributes(MemberAccess Access, MethodKind Kind, } } -Error CVTypeDumper::visitUnknownMember(CVMemberRecord &Record) { +Error TypeDumpVisitor::visitUnknownMember(CVMemberRecord &Record) { W->printHex("UnknownMember", unsigned(Record.Kind)); return Error::success(); } -Error CVTypeDumper::visitUnknownType(CVRecord<TypeLeafKind> &Record) { +Error TypeDumpVisitor::visitUnknownType(CVType &Record) { W->printEnum("Kind", uint16_t(Record.kind()), makeArrayRef(LeafTypeNames)); W->printNumber("Length", uint32_t(Record.content().size())); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - NestedTypeRecord &Nested) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + NestedTypeRecord &Nested) { printTypeIndex("Type", Nested.getNestedType()); W->printString("Name", Nested.getName()); - Name = Nested.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - OneMethodRecord &Method) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + OneMethodRecord &Method) { MethodKind K = Method.getMethodKind(); printMemberAttributes(Method.getAccess(), K, Method.getOptions()); printTypeIndex("Type", Method.getType()); @@ -614,58 +460,53 @@ Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, if (Method.isIntroducingVirtual()) W->printHex("VFTableOffset", Method.getVFTableOffset()); W->printString("Name", Method.getName()); - Name = Method.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - OverloadedMethodRecord &Method) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + OverloadedMethodRecord &Method) { W->printHex("MethodCount", Method.getNumOverloads()); printTypeIndex("MethodListIndex", Method.getMethodList()); W->printString("Name", Method.getName()); - Name = Method.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - DataMemberRecord &Field) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + DataMemberRecord &Field) { printMemberAttributes(Field.getAccess(), MethodKind::Vanilla, MethodOptions::None); printTypeIndex("Type", Field.getType()); W->printHex("FieldOffset", Field.getFieldOffset()); W->printString("Name", Field.getName()); - Name = Field.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - StaticDataMemberRecord &Field) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + StaticDataMemberRecord &Field) { 
printMemberAttributes(Field.getAccess(), MethodKind::Vanilla, MethodOptions::None); printTypeIndex("Type", Field.getType()); W->printString("Name", Field.getName()); - Name = Field.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - VFPtrRecord &VFTable) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + VFPtrRecord &VFTable) { printTypeIndex("Type", VFTable.getType()); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - EnumeratorRecord &Enum) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + EnumeratorRecord &Enum) { printMemberAttributes(Enum.getAccess(), MethodKind::Vanilla, MethodOptions::None); W->printNumber("EnumValue", Enum.getValue()); W->printString("Name", Enum.getName()); - Name = Enum.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - BaseClassRecord &Base) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + BaseClassRecord &Base) { printMemberAttributes(Base.getAccess(), MethodKind::Vanilla, MethodOptions::None); printTypeIndex("BaseType", Base.getBaseType()); @@ -673,8 +514,8 @@ Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - VirtualBaseClassRecord &Base) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + VirtualBaseClassRecord &Base) { printMemberAttributes(Base.getAccess(), MethodKind::Vanilla, MethodOptions::None); printTypeIndex("BaseType", Base.getBaseType()); @@ -684,89 +525,8 @@ Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - ListContinuationRecord &Cont) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + ListContinuationRecord &Cont) { printTypeIndex("ContinuationIndex", Cont.getContinuationIndex()); return Error::success(); } - -StringRef CVTypeDumper::getTypeName(TypeIndex TI) { - if (TI.isNoneType()) - return "<no type>"; - - if (TI.isSimple()) { - // This is a simple type. - for (const auto &SimpleTypeName : SimpleTypeNames) { - if (SimpleTypeName.Value == TI.getSimpleKind()) { - if (TI.getSimpleMode() == SimpleTypeMode::Direct) - return SimpleTypeName.Name.drop_back(1); - // Otherwise, this is a pointer type. We gloss over the distinction - // between near, far, 64, 32, etc, and just give a pointer type. - return SimpleTypeName.Name; - } - } - return "<unknown simple type>"; - } - - // User-defined type. 
- StringRef UDTName; - unsigned UDTIndex = TI.getIndex() - 0x1000; - if (UDTIndex < CVUDTNames.size()) - return CVUDTNames[UDTIndex]; - - return "<unknown UDT>"; -} - -void CVTypeDumper::printTypeIndex(StringRef FieldName, TypeIndex TI) { - StringRef TypeName; - if (!TI.isNoneType()) - TypeName = getTypeName(TI); - if (!TypeName.empty()) - W->printHex(FieldName, TypeName, TI.getIndex()); - else - W->printHex(FieldName, TI.getIndex()); -} - -Error CVTypeDumper::dump(const CVRecord<TypeLeafKind> &Record) { - assert(W && "printer should not be null"); - TypeDeserializer Deserializer; - TypeVisitorCallbackPipeline Pipeline; - Pipeline.addCallbackToPipeline(Deserializer); - Pipeline.addCallbackToPipeline(*this); - - CVTypeVisitor Visitor(Pipeline); - - CVRecord<TypeLeafKind> RecordCopy = Record; - if (auto EC = Visitor.visitTypeRecord(RecordCopy)) - return EC; - return Error::success(); -} - -Error CVTypeDumper::dump(const CVTypeArray &Types) { - assert(W && "printer should not be null"); - TypeDeserializer Deserializer; - TypeVisitorCallbackPipeline Pipeline; - Pipeline.addCallbackToPipeline(Deserializer); - Pipeline.addCallbackToPipeline(*this); - - CVTypeVisitor Visitor(Pipeline); - - if (auto EC = Visitor.visitTypeStream(Types)) - return EC; - return Error::success(); -} - -Error CVTypeDumper::dump(ArrayRef<uint8_t> Data) { - msf::ByteStream Stream(Data); - CVTypeArray Types; - msf::StreamReader Reader(Stream); - if (auto EC = Reader.readArray(Types, Reader.getLength())) - return EC; - - return dump(Types); -} - -void CVTypeDumper::setPrinter(ScopedPrinter *P) { - static ScopedPrinter NullP(llvm::nulls()); - W = P ? P : &NullP; -} diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp index 6126470aa099..08bc74a81e9a 100644 --- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp +++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp @@ -56,13 +56,20 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data, auto A = static_cast<Attribute>(Data.getULEB128(OffsetPtr)); auto F = static_cast<Form>(Data.getULEB128(OffsetPtr)); if (A && F) { - auto FixedFormByteSize = DWARFFormValue::getFixedByteSize(F); - AttributeSpecs.push_back(AttributeSpec(A, F, FixedFormByteSize)); + Optional<int64_t> V; + bool IsImplicitConst = (F == DW_FORM_implicit_const); + if (IsImplicitConst) + V = Data.getSLEB128(OffsetPtr); + else if (auto Size = DWARFFormValue::getFixedByteSize(F)) + V = *Size; + AttributeSpecs.push_back(AttributeSpec(A, F, V)); + if (IsImplicitConst) + continue; // If this abbreviation still has a fixed byte size, then update the // FixedAttributeSize as needed. if (FixedAttributeSize) { - if (FixedFormByteSize) - FixedAttributeSize->NumBytes += *FixedFormByteSize; + if (V) + FixedAttributeSize->NumBytes += *V; else { switch (F) { case DW_FORM_addr: @@ -129,6 +136,8 @@ void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const { OS << formString; else OS << format("DW_FORM_Unknown_%x", Spec.Form); + if (Spec.isImplicitConst()) + OS << '\t' << *Spec.ByteSizeOrValue; OS << '\n'; } OS << '\n'; @@ -160,11 +169,15 @@ Optional<DWARFFormValue> DWARFAbbreviationDeclaration::getAttributeValue( if (*MatchAttrIndex == AttrIndex) { // We have arrived at the attribute to extract, extract it from Offset.
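// On the DW_FORM_implicit_const path added below, no .debug_info bytes are
// consumed at all: the SLEB128 value that extract() above captured from
// .debug_abbrev is cached in Spec.ByteSizeOrValue and is simply replayed
// into the form value via setSValue().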
DWARFFormValue FormValue(Spec.Form); + if (Spec.isImplicitConst()) { + FormValue.setSValue(*Spec.ByteSizeOrValue); + return FormValue; + } if (FormValue.extractValue(DebugInfoData, &Offset, &U)) return FormValue; } // March Offset along until we get to the attribute we want. - if (Optional<uint8_t> FixedSize = Spec.getByteSize(U)) + if (auto FixedSize = Spec.getByteSize(U)) Offset += *FixedSize; else DWARFFormValue::skipValue(Spec.Form, DebugInfoData, &Offset, &U); @@ -185,9 +198,17 @@ size_t DWARFAbbreviationDeclaration::FixedSizeInfo::getByteSize( return ByteSize; } -Optional<uint8_t> DWARFAbbreviationDeclaration::AttributeSpec::getByteSize( +Optional<int64_t> DWARFAbbreviationDeclaration::AttributeSpec::getByteSize( const DWARFUnit &U) const { - return ByteSize ? ByteSize : DWARFFormValue::getFixedByteSize(Form, &U); + if (isImplicitConst()) + return 0; + if (ByteSizeOrValue) + return ByteSizeOrValue; + Optional<int64_t> S; + auto FixedByteSize = DWARFFormValue::getFixedByteSize(Form, &U); + if (FixedByteSize) + S = *FixedByteSize; + return S; } Optional<size_t> DWARFAbbreviationDeclaration::getFixedAttributesByteSize( diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 7df66c76e8b5..77f6f65ee131 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -14,6 +14,7 @@ #include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h" #include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h" #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" +#include "llvm/Object/Decompressor.h" #include "llvm/Object/MachO.h" #include "llvm/Object/RelocVisitor.h" #include "llvm/Support/Compression.h" @@ -577,66 +578,6 @@ DWARFContext::getInliningInfoForAddress(uint64_t Address, return InliningInfo; } -static bool consumeCompressedGnuHeader(StringRef &data, - uint64_t &OriginalSize) { - // Consume "ZLIB" prefix. - if (!data.startswith("ZLIB")) - return false; - data = data.substr(4); - // Consume uncompressed section size (big-endian 8 bytes). - DataExtractor extractor(data, false, 8); - uint32_t Offset = 0; - OriginalSize = extractor.getU64(&Offset); - if (Offset == 0) - return false; - data = data.substr(Offset); - return true; -} - -static bool consumeCompressedZLibHeader(StringRef &Data, uint64_t &OriginalSize, - bool IsLE, bool Is64Bit) { - using namespace ELF; - uint64_t HdrSize = Is64Bit ? sizeof(Elf64_Chdr) : sizeof(Elf32_Chdr); - if (Data.size() < HdrSize) - return false; - - DataExtractor Extractor(Data, IsLE, 0); - uint32_t Offset = 0; - if (Extractor.getUnsigned(&Offset, Is64Bit ? sizeof(Elf64_Word) - : sizeof(Elf32_Word)) != - ELFCOMPRESS_ZLIB) - return false; - - // Skip Elf64_Chdr::ch_reserved field. - if (Is64Bit) - Offset += sizeof(Elf64_Word); - - OriginalSize = Extractor.getUnsigned(&Offset, Is64Bit ? sizeof(Elf64_Xword) - : sizeof(Elf32_Word)); - Data = Data.substr(HdrSize); - return true; -} - -static bool tryDecompress(StringRef &Name, StringRef &Data, - SmallString<32> &Out, bool ZLibStyle, bool IsLE, - bool Is64Bit) { - if (!zlib::isAvailable()) - return false; - - uint64_t OriginalSize; - bool Result = - ZLibStyle ? consumeCompressedZLibHeader(Data, OriginalSize, IsLE, Is64Bit) - : consumeCompressedGnuHeader(Data, OriginalSize); - - if (!Result || zlib::uncompress(Data, Out, OriginalSize) != zlib::StatusOK) - return false; - - // gnu-style names are started from "z", consume that. 
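// For reference, the two on-disk headers involved here (the same checks now
// live behind Decompressor::create() below): a GNU-style ".zdebug_*" section
// begins with the 4-byte magic "ZLIB" followed by a big-endian 64-bit
// uncompressed size, roughly
//
//   if (!Data.startswith("ZLIB") || Data.size() < 12)
//     return false;
//   OriginalSize = support::endian::read64be(Data.data() + 4);
//
// (a sketch equivalent to consumeCompressedGnuHeader(); read64be from
// llvm/Support/Endian.h stands in for the DataExtractor calls), while a
// SHF_COMPRESSED section begins with an Elf32_Chdr/Elf64_Chdr whose ch_type
// must equal ELFCOMPRESS_ZLIB.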
- if (!ZLibStyle) - Name = Name.substr(1); - return true; -} - DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, const LoadedObjectInfo *L) : IsLittleEndian(Obj.isLittleEndian()), @@ -660,18 +601,23 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, if (!L || !L->getLoadedSectionContents(*RelocatedSection,data)) Section.getContents(data); - name = name.substr(name.find_first_not_of("._")); // Skip . and _ prefixes. - - bool ZLibStyleCompressed = Section.isCompressed(); - if (ZLibStyleCompressed || name.startswith("zdebug_")) { + if (Decompressor::isCompressed(Section)) { + Expected<Decompressor> Decompressor = + Decompressor::create(name, data, IsLittleEndian, AddressSize == 8); + if (!Decompressor) + continue; SmallString<32> Out; - if (!tryDecompress(name, data, Out, ZLibStyleCompressed, IsLittleEndian, - AddressSize == 8)) + if (auto Err = Decompressor->decompress(Out)) continue; UncompressedSections.emplace_back(std::move(Out)); data = UncompressedSections.back(); } + // Compressed section names in GNU style start with ".z"; + // at this point the section is decompressed and we drop the compression prefix. + name = name.substr( + name.find_first_not_of("._z")); // Skip ".", "z" and "_" prefixes. + StringRef *SectionData = StringSwitch<StringRef *>(name) .Case("debug_info", &InfoSection.Data) diff --git a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp index 9f623e4954c8..c487e1dca7c6 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp @@ -57,7 +57,7 @@ bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint32_t *OffsetPtr, // Skip all data in the .debug_info for the attributes for (const auto &AttrSpec : AbbrevDecl->attributes()) { // Check if this attribute has a fixed byte size. - if (Optional<uint8_t> FixedSize = AttrSpec.getByteSize(U)) { + if (auto FixedSize = AttrSpec.getByteSize(U)) { // Attribute byte size is fixed, so just add the size to the offset. *OffsetPtr += *FixedSize; } else if (!DWARFFormValue::skipValue(AttrSpec.Form, DebugInfoData, diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index 2aac3474654f..89b83b11ab68 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -152,13 +152,6 @@ const char *DWARFDie::getAttributeValueAsString(dwarf::Attribute Attr, return Result.hasValue() ?
Result.getValue() : FailValue; } -uint64_t DWARFDie::getAttributeValueAsAddress(dwarf::Attribute Attr, - uint64_t FailValue) const { - if (auto Value = getAttributeValueAsAddress(Attr)) - return *Value; - return FailValue; -} - Optional<uint64_t> DWARFDie::getAttributeValueAsAddress(dwarf::Attribute Attr) const { if (auto FormValue = getAttributeValue(Attr)) @@ -166,13 +159,6 @@ DWARFDie::getAttributeValueAsAddress(dwarf::Attribute Attr) const { return None; } -int64_t DWARFDie::getAttributeValueAsSignedConstant(dwarf::Attribute Attr, - int64_t FailValue) const { - if (auto Value = getAttributeValueAsSignedConstant(Attr)) - return *Value; - return FailValue; -} - Optional<int64_t> DWARFDie::getAttributeValueAsSignedConstant(dwarf::Attribute Attr) const { if (auto FormValue = getAttributeValue(Attr)) @@ -180,15 +166,6 @@ DWARFDie::getAttributeValueAsSignedConstant(dwarf::Attribute Attr) const { return None; } -uint64_t -DWARFDie::getAttributeValueAsUnsignedConstant(dwarf::Attribute Attr, - uint64_t FailValue) const { - if (auto Value = getAttributeValueAsUnsignedConstant(Attr)) - return *Value; - return FailValue; -} - - Optional<uint64_t> DWARFDie::getAttributeValueAsUnsignedConstant(dwarf::Attribute Attr) const { if (auto FormValue = getAttributeValue(Attr)) @@ -196,14 +173,6 @@ DWARFDie::getAttributeValueAsUnsignedConstant(dwarf::Attribute Attr) const { return None; } -uint64_t DWARFDie::getAttributeValueAsReference(dwarf::Attribute Attr, - uint64_t FailValue) const { - if (auto Value = getAttributeValueAsReference(Attr)) - return *Value; - return FailValue; -} - - Optional<uint64_t> DWARFDie::getAttributeValueAsReference(dwarf::Attribute Attr) const { if (auto FormValue = getAttributeValue(Attr)) @@ -211,13 +180,6 @@ DWARFDie::getAttributeValueAsReference(dwarf::Attribute Attr) const { return None; } -uint64_t DWARFDie::getAttributeValueAsSectionOffset(dwarf::Attribute Attr, - uint64_t FailValue) const { - if (auto Value = getAttributeValueAsSectionOffset(Attr)) - return *Value; - return FailValue; -} - Optional<uint64_t> DWARFDie::getAttributeValueAsSectionOffset(dwarf::Attribute Attr) const { if (auto FormValue = getAttributeValue(Attr)) @@ -345,9 +307,10 @@ DWARFDie::getName(DINameKind Kind) const { void DWARFDie::getCallerFrame(uint32_t &CallFile, uint32_t &CallLine, uint32_t &CallColumn) const { - CallFile = getAttributeValueAsUnsignedConstant(DW_AT_call_file, 0); - CallLine = getAttributeValueAsUnsignedConstant(DW_AT_call_line, 0); - CallColumn = getAttributeValueAsUnsignedConstant(DW_AT_call_column, 0); + CallFile = getAttributeValueAsUnsignedConstant(DW_AT_call_file).getValueOr(0); + CallLine = getAttributeValueAsUnsignedConstant(DW_AT_call_line).getValueOr(0); + CallColumn = + getAttributeValueAsUnsignedConstant(DW_AT_call_column).getValueOr(0); } void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth, diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp index e48a6f0981b7..dc9310dc4e89 100644 --- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -153,7 +153,7 @@ static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) { return 16; case DW_FORM_implicit_const: - // The implicit value is stored in the abbreviation as a ULEB128, any + // The implicit value is stored in the abbreviation as a SLEB128, and // there is no data in debug info.
return 0; @@ -280,6 +280,8 @@ bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const { case DW_FORM_GNU_str_index: case DW_FORM_GNU_strp_alt: return (FC == FC_String); + case DW_FORM_implicit_const: + return (FC == FC_Constant); default: break; } diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index 63fb0d3bc368..ee2c569b0bce 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -230,10 +230,12 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) { BaseAddr = UnitDie.getAttributeValueAsAddress(DW_AT_entry_pc); if (BaseAddr) setBaseAddress(*BaseAddr); - AddrOffsetSectionBase = UnitDie.getAttributeValueAsSectionOffset( - DW_AT_GNU_addr_base, 0); - RangeSectionBase = UnitDie.getAttributeValueAsSectionOffset( - DW_AT_rnglists_base, 0); + AddrOffsetSectionBase = + UnitDie.getAttributeValueAsSectionOffset(DW_AT_GNU_addr_base) + .getValueOr(0); + RangeSectionBase = + UnitDie.getAttributeValueAsSectionOffset(DW_AT_rnglists_base) + .getValueOr(0); // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for // skeleton CU DIE, so that DWARF users not aware of it are not broken. } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 8f6b1849169a..05615d3cc6cf 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -374,6 +374,9 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, write(isBE, TargetPtr, static_cast<uint32_t>(Result & 0xffffffffU)); break; } + case ELF::R_AARCH64_PREL64: + write(isBE, TargetPtr, Value + Addend - FinalAddress); + break; case ELF::R_AARCH64_CALL26: // fallthrough case ELF::R_AARCH64_JUMP26: { // Operation: S+A-P. Set Call or B immediate value to bits fff_fffc of the diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index a87b9bec1ed2..e3a7bae02e0a 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -77,6 +77,11 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { switch (Name[0]) { default: break; case 'a': { + if (Name.startswith("arm.rbit") || Name.startswith("aarch64.rbit")) { + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::bitreverse, + F->arg_begin()->getType()); + return true; + } if (Name.startswith("arm.neon.vclz")) { Type* args[2] = { F->arg_begin()->getType(), @@ -1761,6 +1766,11 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { return; } + case Intrinsic::bitreverse: + CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {CI->getArgOperand(0)})); + CI->eraseFromParent(); + return; + case Intrinsic::ctlz: case Intrinsic::cttz: assert(CI->getNumArgOperands() == 1 && diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp index 2ea572490b6d..d06161067f5f 100644 --- a/lib/IR/DIBuilder.cpp +++ b/lib/IR/DIBuilder.cpp @@ -90,6 +90,20 @@ void DIBuilder::finalize() { VMContext, SmallVector<Metadata *, 16>(AllImportedModules.begin(), AllImportedModules.end()))); + for (const auto &I : AllMacrosPerParent) { + // DIMacroNode's with nullptr parent are DICompileUnit direct children. + if (!I.first) { + CUNode->replaceMacros(MDTuple::get(VMContext, I.second.getArrayRef())); + continue; + } + // Otherwise, it must be a temporary DIMacroFile that need to be resolved. 
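As a usage-level illustration of the macro API this change adds (createMacro and createTempMacroFile are defined just below in the hunk), here is a hedged sketch assuming an already-configured DIBuilder and DIFile; the function and variable names are illustrative only:

    #include "llvm/IR/DIBuilder.h"
    #include "llvm/Support/Dwarf.h"

    void emitMacrosSketch(llvm::DIBuilder &DIB, llvm::DIFile *File) {
      using namespace llvm;
      // A DIMacroFile with a null parent is attached directly to the
      // DICompileUnit by finalize() via replaceMacros().
      DIMacroFile *MF =
          DIB.createTempMacroFile(/*Parent=*/nullptr, /*LineNumber=*/0, File);

      // Children are collected per parent in AllMacrosPerParent; the
      // temporary node is swapped for a uniqued DIMacroFile in finalize().
      DIB.createMacro(MF, /*LineNumber=*/1, dwarf::DW_MACINFO_define,
                      "VERSION", "4");
      DIB.createMacro(MF, /*LineNumber=*/9, dwarf::DW_MACINFO_undef,
                      "NDEBUG", StringRef());

      DIB.finalize(); // resolves the temporary DIMacroFile created above
    }
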
+ auto *TMF = cast<DIMacroFile>(I.first); + auto *MF = DIMacroFile::get(VMContext, dwarf::DW_MACINFO_start_file, + TMF->getLine(), TMF->getFile(), + getOrCreateMacroArray(I.second.getArrayRef())); + replaceTemporary(llvm::TempDIMacroNode(TMF), MF); + } + // Now that all temp nodes have been replaced or deleted, resolve remaining // cycles. for (const auto &N : UnresolvedNodes) @@ -179,6 +193,31 @@ DIFile *DIBuilder::createFile(StringRef Filename, StringRef Directory, return DIFile::get(VMContext, Filename, Directory, CSKind, Checksum); } +DIMacro *DIBuilder::createMacro(DIMacroFile *Parent, unsigned LineNumber, + unsigned MacroType, StringRef Name, + StringRef Value) { + assert(!Name.empty() && "Unable to create macro without name"); + assert((MacroType == dwarf::DW_MACINFO_undef || + MacroType == dwarf::DW_MACINFO_define) && + "Unexpected macro type"); + auto *M = DIMacro::get(VMContext, MacroType, LineNumber, Name, Value); + AllMacrosPerParent[Parent].insert(M); + return M; +} + +DIMacroFile *DIBuilder::createTempMacroFile(DIMacroFile *Parent, + unsigned LineNumber, DIFile *File) { + auto *MF = DIMacroFile::getTemporary(VMContext, dwarf::DW_MACINFO_start_file, + LineNumber, File, DIMacroNodeArray()) + .release(); + AllMacrosPerParent[Parent].insert(MF); + // Add the new temporary DIMacroFile to the macro per parent map as a parent. + // This is needed to assure DIMacroFile with no children to have an entry in + // the map. Otherwise, it will not be resolved in DIBuilder::finalize(). + AllMacrosPerParent.insert({MF, {}}); + return MF; +} + DIEnumerator *DIBuilder::createEnumerator(StringRef Name, int64_t Val) { assert(!Name.empty() && "Unable to create enumerator without name"); return DIEnumerator::get(VMContext, Val, Name); @@ -509,6 +548,11 @@ DINodeArray DIBuilder::getOrCreateArray(ArrayRef<Metadata *> Elements) { return MDTuple::get(VMContext, Elements); } +DIMacroNodeArray +DIBuilder::getOrCreateMacroArray(ArrayRef<Metadata *> Elements) { + return MDTuple::get(VMContext, Elements); +} + DITypeRefArray DIBuilder::getOrCreateTypeArray(ArrayRef<Metadata *> Elements) { SmallVector<llvm::Metadata *, 16> Elts; for (unsigned i = 0, e = Elements.size(); i != e; ++i) { diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp index 31f89514151c..6f7356524d38 100644 --- a/lib/IR/Globals.cpp +++ b/lib/IR/Globals.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/Operator.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" +#include "LLVMContextImpl.h" using namespace llvm; //===----------------------------------------------------------------------===// @@ -37,6 +38,10 @@ static_assert(sizeof(GlobalValue) == sizeof(Constant) + 2 * sizeof(void *) + 2 * sizeof(unsigned), "unexpected GlobalValue size growth"); +// GlobalObject adds a comdat. +static_assert(sizeof(GlobalObject) == sizeof(GlobalValue) + sizeof(void *), + "unexpected GlobalObject size growth"); + bool GlobalValue::isMaterializable() const { if (const Function *F = dyn_cast<Function>(this)) return F->isMaterializable(); @@ -160,11 +165,24 @@ Comdat *GlobalValue::getComdat() { return cast<GlobalObject>(this)->getComdat(); } -void GlobalObject::setSection(StringRef S) { - Section = S; +StringRef GlobalObject::getSectionImpl() const { + assert(hasSection()); + return getContext().pImpl->GlobalObjectSections[this]; +} - // The C api requires this to be null terminated. - Section.c_str(); +void GlobalObject::setSection(StringRef S) { + // Do nothing if we're clearing the section and it is already empty. 
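The lines that follow intern the section name in a context-owned StringSet, so the returned StringRef stays valid for the lifetime of the LLVMContext and identical names share one allocation. The same pattern in isolation (a standalone sketch, independent of the patch):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/StringSet.h"

    llvm::StringSet<> SectionStrings; // owned by LLVMContextImpl in the patch

    // insert() returns an iterator to the (possibly pre-existing) entry;
    // first() is a StringRef into storage owned by the set itself.
    llvm::StringRef intern(llvm::StringRef S) {
      return SectionStrings.insert(S).first->first();
    }
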
+ if (!hasSection() && S.empty()) + return; + + // Get or create a stable section name string and put it in the table in the + // context. + S = getContext().pImpl->SectionStrings.insert(S).first->first(); + getContext().pImpl->GlobalObjectSections[this] = S; + + // Update the HasSectionHashEntryBit. Setting the section to the empty string + // means this global no longer has a section. + setGlobalObjectFlag(HasSectionHashEntryBit, !S.empty()); } bool GlobalValue::isDeclaration() const { diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h index e9e30ef0656f..850c81cfabb2 100644 --- a/lib/IR/LLVMContextImpl.h +++ b/lib/IR/LLVMContextImpl.h @@ -26,6 +26,7 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringSet.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" @@ -1194,6 +1195,12 @@ public: /// Collection of per-GlobalObject metadata used in this context. DenseMap<const GlobalObject *, MDGlobalAttachmentMap> GlobalObjectMetadata; + /// Collection of per-GlobalObject sections used in this context. + DenseMap<const GlobalObject *, StringRef> GlobalObjectSections; + + /// Stable collection of section strings. + StringSet<> SectionStrings; + /// DiscriminatorTable - This table maps file:line locations to an /// integer representing the next DWARF path discriminator to assign to /// instructions in different blocks at the same location. diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index 6342cbe4fd90..809db80bc916 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -17,7 +17,6 @@ #include "llvm/LTO/LTOBackend.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CGSCCPassManager.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeReader.h" @@ -36,6 +35,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/FunctionImportUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index 928f69a17de9..a14b86179d6e 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -284,7 +284,8 @@ public: const FunctionImporter::ExportSetTy &ExportList, const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedFunctions, - const DenseSet<GlobalValue::GUID> &PreservedSymbols) { + const DenseSet<GlobalValue::GUID> &PreservedSymbols, unsigned OptLevel, + const TargetMachineBuilder &TMBuilder) { if (CachePath.empty()) return; @@ -306,12 +307,42 @@ public: SHA1 Hasher; + // Include the parts of the LTO configuration that affect code generation. + auto AddString = [&](StringRef Str) { + Hasher.update(Str); + Hasher.update(ArrayRef<uint8_t>{0}); + }; + auto AddUnsigned = [&](unsigned I) { + uint8_t Data[4]; + Data[0] = I; + Data[1] = I >> 8; + Data[2] = I >> 16; + Data[3] = I >> 24; + Hasher.update(ArrayRef<uint8_t>{Data, 4}); + }; + // Start with the compiler revision Hasher.update(LLVM_VERSION_STRING); #ifdef HAVE_LLVM_REVISION Hasher.update(LLVM_REVISION); #endif + // Hash the optimization level and the target machine settings. + AddString(TMBuilder.MCpu); + // FIXME: Hash more of Options. 
For now all clients initialize Options from + // command-line flags (which is unsupported in production), but may set + // RelaxELFRelocations. The clang driver can also pass FunctionSections, + // DataSections and DebuggerTuning via command line flags. + AddUnsigned(TMBuilder.Options.RelaxELFRelocations); + AddUnsigned(TMBuilder.Options.FunctionSections); + AddUnsigned(TMBuilder.Options.DataSections); + AddUnsigned((unsigned)TMBuilder.Options.DebuggerTuning); + AddString(TMBuilder.MAttr); + if (TMBuilder.RelocModel) + AddUnsigned(*TMBuilder.RelocModel); + AddUnsigned(TMBuilder.CGOptLevel); + AddUnsigned(OptLevel); + Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash))); for (auto F : ExportList) // The export list can impact the internalization, be conservative here @@ -928,7 +959,8 @@ void ThinLTOCodeGenerator::run() { ModuleCacheEntry CacheEntry(CacheOptions.Path, *Index, ModuleIdentifier, ImportLists[ModuleIdentifier], ExportList, ResolvedODR[ModuleIdentifier], - DefinedFunctions, GUIDPreservedSymbols); + DefinedFunctions, GUIDPreservedSymbols, + OptLevel, TMBuilder); auto CacheEntryPath = CacheEntry.getEntryPath(); { diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt index f1a7c1a5ade9..b895c3fcc050 100644 --- a/lib/Object/CMakeLists.txt +++ b/lib/Object/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMObject ArchiveWriter.cpp Binary.cpp COFFObjectFile.cpp + Decompressor.cpp ELF.cpp ELFObjectFile.cpp Error.cpp diff --git a/lib/Object/Decompressor.cpp b/lib/Object/Decompressor.cpp new file mode 100644 index 000000000000..bca41fd9f487 --- /dev/null +++ b/lib/Object/Decompressor.cpp @@ -0,0 +1,102 @@ +//===-- Decompressor.cpp --------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/Decompressor.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Support/Compression.h" +#include "llvm/Support/DataExtractor.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; +using namespace llvm::support::endian; +using namespace object; + +Expected<Decompressor> Decompressor::create(StringRef Name, StringRef Data, + bool IsLE, bool Is64Bit) { + if (!zlib::isAvailable()) + return createError("zlib is not available"); + + Decompressor D(Data); + Error Err = isGnuStyle(Name) ? D.consumeCompressedGnuHeader() + : D.consumeCompressedZLibHeader(Is64Bit, IsLE); + if (Err) + return std::move(Err); + return D; +} + +Decompressor::Decompressor(StringRef Data) + : SectionData(Data), DecompressedSize(0) {} + +Error Decompressor::consumeCompressedGnuHeader() { + if (!SectionData.startswith("ZLIB")) + return createError("corrupted compressed section header"); + + SectionData = SectionData.substr(4); + + // Consume uncompressed section size (big-endian 8 bytes). + if (SectionData.size() < 8) + return createError("corrupted uncompressed section size"); + DecompressedSize = read64be(SectionData.data()); + SectionData = SectionData.substr(8); + + return Error::success(); +} + +Error Decompressor::consumeCompressedZLibHeader(bool Is64Bit, + bool IsLittleEndian) { + using namespace ELF; + uint64_t HdrSize = Is64Bit ? 
sizeof(Elf64_Chdr) : sizeof(Elf32_Chdr); + if (SectionData.size() < HdrSize) + return createError("corrupted compressed section header"); + + DataExtractor Extractor(SectionData, IsLittleEndian, 0); + uint32_t Offset = 0; + if (Extractor.getUnsigned(&Offset, Is64Bit ? sizeof(Elf64_Word) + : sizeof(Elf32_Word)) != + ELFCOMPRESS_ZLIB) + return createError("unsupported compression type"); + + // Skip Elf64_Chdr::ch_reserved field. + if (Is64Bit) + Offset += sizeof(Elf64_Word); + + DecompressedSize = Extractor.getUnsigned( + &Offset, Is64Bit ? sizeof(Elf64_Xword) : sizeof(Elf32_Word)); + SectionData = SectionData.substr(HdrSize); + return Error::success(); +} + +bool Decompressor::isGnuStyle(StringRef Name) { + return Name.startswith(".zdebug"); +} + +bool Decompressor::isCompressed(const object::SectionRef &Section) { + StringRef Name; + if (Section.getName(Name)) + return false; + return Section.isCompressed() || isGnuStyle(Name); +} + +bool Decompressor::isCompressedELFSection(uint64_t Flags, StringRef Name) { + return (Flags & ELF::SHF_COMPRESSED) || isGnuStyle(Name); +} + +Error Decompressor::decompress(SmallString<32> &Out) { + Out.resize(DecompressedSize); + return decompress({Out.data(), (size_t)DecompressedSize}); +} + +Error Decompressor::decompress(MutableArrayRef<char> Buffer) { + size_t Size = Buffer.size(); + zlib::Status Status = zlib::uncompress(SectionData, Buffer.data(), Size); + if (Status != zlib::StatusOK) + return createError("decompression failed"); + return Error::success(); +} diff --git a/lib/ObjectYAML/DWARFYAML.cpp b/lib/ObjectYAML/DWARFYAML.cpp index 42a448a7bdfd..014e63fe7d34 100644 --- a/lib/ObjectYAML/DWARFYAML.cpp +++ b/lib/ObjectYAML/DWARFYAML.cpp @@ -27,17 +27,18 @@ void MappingTraits<DWARFYAML::Data>::mapping(IO &IO, DWARFYAML::Data &DWARF) { IO.setContext(&DWARF); IO.mapOptional("debug_str", DWARF.DebugStrings); IO.mapOptional("debug_abbrev", DWARF.AbbrevDecls); - if(!DWARF.ARanges.empty() || !IO.outputting()) + if (!DWARF.ARanges.empty() || !IO.outputting()) IO.mapOptional("debug_aranges", DWARF.ARanges); - if(!DWARF.PubNames.Entries.empty() || !IO.outputting()) + if (!DWARF.PubNames.Entries.empty() || !IO.outputting()) IO.mapOptional("debug_pubnames", DWARF.PubNames); - if(!DWARF.PubTypes.Entries.empty() || !IO.outputting()) + if (!DWARF.PubTypes.Entries.empty() || !IO.outputting()) IO.mapOptional("debug_pubtypes", DWARF.PubTypes); - if(!DWARF.GNUPubNames.Entries.empty() || !IO.outputting()) + if (!DWARF.GNUPubNames.Entries.empty() || !IO.outputting()) IO.mapOptional("debug_gnu_pubnames", DWARF.GNUPubNames); - if(!DWARF.GNUPubTypes.Entries.empty() || !IO.outputting()) + if (!DWARF.GNUPubTypes.Entries.empty() || !IO.outputting()) IO.mapOptional("debug_gnu_pubtypes", DWARF.GNUPubTypes); IO.mapOptional("debug_info", DWARF.CompileUnits); + IO.mapOptional("debug_line", DWARF.DebugLines); IO.setContext(&oldContext); } @@ -62,7 +63,7 @@ void MappingTraits<DWARFYAML::ARangeDescriptor>::mapping( } void MappingTraits<DWARFYAML::ARange>::mapping(IO &IO, - DWARFYAML::ARange &Range) { + DWARFYAML::ARange &Range) { IO.mapRequired("Length", Range.Length); IO.mapRequired("Version", Range.Version); IO.mapRequired("CuOffset", Range.CuOffset); @@ -106,15 +107,61 @@ void MappingTraits<DWARFYAML::Entry>::mapping(IO &IO, DWARFYAML::Entry &Entry) { IO.mapRequired("Values", Entry.Values); } -void MappingTraits<DWARFYAML::FormValue>::mapping(IO &IO, - DWARFYAML::FormValue &FormValue) { +void MappingTraits<DWARFYAML::FormValue>::mapping( + IO &IO, DWARFYAML::FormValue &FormValue) 
{ IO.mapOptional("Value", FormValue.Value); - if(!FormValue.CStr.empty() || !IO.outputting()) + if (!FormValue.CStr.empty() || !IO.outputting()) IO.mapOptional("CStr", FormValue.CStr); - if(!FormValue.BlockData.empty() || !IO.outputting()) + if (!FormValue.BlockData.empty() || !IO.outputting()) IO.mapOptional("BlockData", FormValue.BlockData); } +void MappingTraits<DWARFYAML::File>::mapping(IO &IO, DWARFYAML::File &File) { + IO.mapRequired("Name", File.Name); + IO.mapRequired("DirIdx", File.DirIdx); + IO.mapRequired("ModTime", File.ModTime); + IO.mapRequired("Length", File.Length); +} + +void MappingTraits<DWARFYAML::LineTableOpcode>::mapping( + IO &IO, DWARFYAML::LineTableOpcode &LineTableOpcode) { + IO.mapRequired("Opcode", LineTableOpcode.Opcode); + if (LineTableOpcode.Opcode == dwarf::DW_LNS_extended_op) { + IO.mapRequired("ExtLen", LineTableOpcode.ExtLen); + IO.mapRequired("SubOpcode", LineTableOpcode.SubOpcode); + } + + if (!LineTableOpcode.UnknownOpcodeData.empty() || !IO.outputting()) + IO.mapOptional("UnknownOpcodeData", LineTableOpcode.UnknownOpcodeData); + if (!LineTableOpcode.UnknownOpcodeData.empty() || !IO.outputting()) + IO.mapOptional("StandardOpcodeData", LineTableOpcode.StandardOpcodeData); + if (!LineTableOpcode.FileEntry.Name.empty() || !IO.outputting()) + IO.mapOptional("FileEntry", LineTableOpcode.FileEntry); + if (LineTableOpcode.Opcode == dwarf::DW_LNS_advance_line || !IO.outputting()) + IO.mapOptional("SData", LineTableOpcode.SData); + IO.mapOptional("Data", LineTableOpcode.Data); +} + +void MappingTraits<DWARFYAML::LineTable>::mapping( + IO &IO, DWARFYAML::LineTable &LineTable) { + IO.mapRequired("TotalLength", LineTable.TotalLength); + if (LineTable.TotalLength == UINT32_MAX) + IO.mapRequired("TotalLength64", LineTable.TotalLength64); + IO.mapRequired("Version", LineTable.Version); + IO.mapRequired("PrologueLength", LineTable.PrologueLength); + IO.mapRequired("MinInstLength", LineTable.MinInstLength); + if(LineTable.Version >= 4) + IO.mapRequired("MaxOpsPerInst", LineTable.MaxOpsPerInst); + IO.mapRequired("DefaultIsStmt", LineTable.DefaultIsStmt); + IO.mapRequired("LineBase", LineTable.LineBase); + IO.mapRequired("LineRange", LineTable.LineRange); + IO.mapRequired("OpcodeBase", LineTable.OpcodeBase); + IO.mapRequired("StandardOpcodeLengths", LineTable.StandardOpcodeLengths); + IO.mapRequired("IncludeDirs", LineTable.IncludeDirs); + IO.mapRequired("Files", LineTable.Files); + IO.mapRequired("Opcodes", LineTable.Opcodes); +} + } // namespace llvm::yaml } // namespace llvm diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 6e0aae5fd852..2994a07b1ccf 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -94,14 +94,17 @@ #include "llvm/Transforms/Scalar/Float2Int.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/GuardWidening.h" +#include "llvm/Transforms/Scalar/IVUsersPrinter.h" #include "llvm/Transforms/Scalar/IndVarSimplify.h" #include "llvm/Transforms/Scalar/JumpThreading.h" #include "llvm/Transforms/Scalar/LICM.h" +#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h" #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" #include "llvm/Transforms/Scalar/LoopDeletion.h" #include "llvm/Transforms/Scalar/LoopDistribute.h" #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" #include "llvm/Transforms/Scalar/LoopInstSimplify.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Scalar/LoopRotation.h" #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" #include 
"llvm/Transforms/Scalar/LoopStrengthReduce.h" @@ -220,7 +223,8 @@ public: /// \brief No-op loop pass which does nothing. struct NoOpLoopPass { - PreservedAnalyses run(Loop &L, LoopAnalysisManager &) { + PreservedAnalyses run(Loop &L, LoopAnalysisManager &, + LoopStandardAnalysisResults &, LPMUpdater &) { return PreservedAnalyses::all(); } static StringRef name() { return "NoOpLoopPass"; } @@ -233,7 +237,9 @@ class NoOpLoopAnalysis : public AnalysisInfoMixin<NoOpLoopAnalysis> { public: struct Result {}; - Result run(Loop &, LoopAnalysisManager &) { return Result(); } + Result run(Loop &, LoopAnalysisManager &, LoopStandardAnalysisResults &) { + return Result(); + } static StringRef name() { return "NoOpLoopAnalysis"; } }; @@ -1019,7 +1025,9 @@ bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, #define LOOP_ANALYSIS(NAME, CREATE_PASS) \ if (Name == "require<" NAME ">") { \ LPM.addPass(RequireAnalysisPass< \ - std::remove_reference<decltype(CREATE_PASS)>::type, Loop>()); \ + std::remove_reference<decltype(CREATE_PASS)>::type, Loop, \ + LoopAnalysisManager, LoopStandardAnalysisResults &, \ + LPMUpdater &>()); \ return true; \ } \ if (Name == "invalidate<" NAME ">") { \ diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index 77c6ffc9c253..74acd9e5e207 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -811,4 +811,47 @@ bool needsComdatForCounter(const Function &F, const Module &M) { return true; } + +// Check if INSTR_PROF_RAW_VERSION_VAR is defined. +bool isIRPGOFlagSet(const Module *M) { + auto IRInstrVar = + M->getNamedGlobal(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)); + if (!IRInstrVar || IRInstrVar->isDeclaration() || + IRInstrVar->hasLocalLinkage()) + return false; + + // Check if the flag is set. + if (!IRInstrVar->hasInitializer()) + return false; + + const Constant *InitVal = IRInstrVar->getInitializer(); + if (!InitVal) + return false; + + return (dyn_cast<ConstantInt>(InitVal)->getZExtValue() & + VARIANT_MASK_IR_PROF) != 0; +} + +// Check if we can safely rename this Comdat function. +bool canRenameComdatFunc(const Function &F, bool CheckAddressTaken) { + if (F.getName().empty()) + return false; + if (!needsComdatForCounter(F, *(F.getParent()))) + return false; + // Unsafe to rename the address-taken function (which can be used in + // function comparison). + if (CheckAddressTaken && F.hasAddressTaken()) + return false; + // Only safe to do if this function may be discarded if it is not used + // in the compilation unit. + if (!GlobalValue::isDiscardableIfUnused(F.getLinkage())) + return false; + + // For AvailableExternallyLinkage functions. 
+ if (!F.hasComdat()) { + assert(F.getLinkage() == GlobalValue::AvailableExternallyLinkage); + return true; + } + return true; +} } // end namespace llvm diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp index 2c7bf0435d88..57e5a8d7871c 100644 --- a/lib/Support/FileOutputBuffer.cpp +++ b/lib/Support/FileOutputBuffer.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/Errc.h" +#include "llvm/Support/Path.h" #include "llvm/Support/Signals.h" #include <system_error> @@ -28,8 +29,10 @@ using llvm::sys::fs::mapped_file_region; namespace llvm { FileOutputBuffer::FileOutputBuffer(std::unique_ptr<mapped_file_region> R, - StringRef Path, StringRef TmpPath) - : Region(std::move(R)), FinalPath(Path), TempPath(TmpPath) {} + StringRef Path, StringRef TmpPath, + bool IsRegular) + : Region(std::move(R)), FinalPath(Path), TempPath(TmpPath), + IsRegular(IsRegular) {} FileOutputBuffer::~FileOutputBuffer() { // Close the mapping before deleting the temp file, so that the removal @@ -40,9 +43,10 @@ FileOutputBuffer::~FileOutputBuffer() { ErrorOr<std::unique_ptr<FileOutputBuffer>> FileOutputBuffer::create(StringRef FilePath, size_t Size, unsigned Flags) { - // If file already exists, it must be a regular file (to be mappable). + // Check file is not a regular file, in which case we cannot remove it. sys::fs::file_status Stat; std::error_code EC = sys::fs::status(FilePath, Stat); + bool IsRegular = true; switch (Stat.type()) { case sys::fs::file_type::file_not_found: // If file does not exist, we'll create one. @@ -56,25 +60,34 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size, unsigned Flags) { default: if (EC) return EC; - else - return make_error_code(errc::operation_not_permitted); + IsRegular = false; } - // Delete target file. - EC = sys::fs::remove(FilePath); - if (EC) - return EC; - - unsigned Mode = sys::fs::all_read | sys::fs::all_write; - // If requested, make the output file executable. - if (Flags & F_executable) - Mode |= sys::fs::all_exe; + if (IsRegular) { + // Delete target file. + EC = sys::fs::remove(FilePath); + if (EC) + return EC; + } - // Create new file in same directory but with random name. SmallString<128> TempFilePath; int FD; - EC = sys::fs::createUniqueFile(Twine(FilePath) + ".tmp%%%%%%%", FD, - TempFilePath, Mode); + if (IsRegular) { + unsigned Mode = sys::fs::all_read | sys::fs::all_write; + // If requested, make the output file executable. + if (Flags & F_executable) + Mode |= sys::fs::all_exe; + // Create new file in same directory but with random name. + EC = sys::fs::createUniqueFile(Twine(FilePath) + ".tmp%%%%%%%", FD, + TempFilePath, Mode); + } else { + // Create a temporary file. Since this is a special file, we will not move + // it and the new file can be in another filesystem. This avoids trying to + // create a temporary file in /dev when outputting to /dev/null for example. 
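At the API surface, the effect of the FileOutputBuffer change below is that create() now succeeds for special files and commit() copies instead of renaming. A hedged usage sketch, assuming the declarations in llvm/Support/FileOutputBuffer.h and trimming error handling:

    #include "llvm/Support/FileOutputBuffer.h"
    #include <cstring>
    using namespace llvm;

    void writeFourBytes(StringRef Path) {
      // Regular file: mmap a temp file and rename it into place on commit().
      // Special file such as /dev/null: stage elsewhere, then copy_file().
      ErrorOr<std::unique_ptr<FileOutputBuffer>> BufOrErr =
          FileOutputBuffer::create(Path, /*Size=*/4);
      if (!BufOrErr)
        return;
      std::unique_ptr<FileOutputBuffer> &Buf = *BufOrErr;
      memcpy(Buf->getBufferStart(), "data", 4);
      Buf->commit();
    }
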
+ EC = sys::fs::createTemporaryFile(sys::path::filename(FilePath), "", FD, + TempFilePath); + } + if (EC) return EC; @@ -99,8 +112,8 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size, unsigned Flags) { if (Ret) return std::error_code(errno, std::generic_category()); - std::unique_ptr<FileOutputBuffer> Buf( - new FileOutputBuffer(std::move(MappedFile), FilePath, TempFilePath)); + std::unique_ptr<FileOutputBuffer> Buf(new FileOutputBuffer( + std::move(MappedFile), FilePath, TempFilePath, IsRegular)); return std::move(Buf); } @@ -108,10 +121,19 @@ std::error_code FileOutputBuffer::commit() { // Unmap buffer, letting OS flush dirty pages to file on disk. Region.reset(); + std::error_code EC; + if (IsRegular) { + // Rename file to final name. + EC = sys::fs::rename(Twine(TempPath), Twine(FinalPath)); + sys::DontRemoveFileOnSignal(TempPath); + } else { + EC = sys::fs::copy_file(TempPath, FinalPath); + std::error_code RMEC = sys::fs::remove(TempPath); + sys::DontRemoveFileOnSignal(TempPath); + if (RMEC) + return RMEC; + } - // Rename file to final name. - std::error_code EC = sys::fs::rename(Twine(TempPath), Twine(FinalPath)); - sys::DontRemoveFileOnSignal(TempPath); return EC; } } // namespace diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 8a09589aa884..d1b40412a6fc 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -111,6 +111,7 @@ enum ProcessorTypes { AMDATHLON, AMDFAM14H, AMDFAM16H, + AMDFAM17H, CPU_TYPE_MAX }; @@ -149,6 +150,7 @@ enum ProcessorSubtypes { AMD_BTVER2, AMDFAM15H_BDVER3, AMDFAM15H_BDVER4, + AMDFAM17H_ZNVER1, CPU_SUBTYPE_MAX }; @@ -742,6 +744,14 @@ static void getAMDProcessorTypeAndSubtype(unsigned int Family, } *Subtype = AMD_BTVER2; break; // "btver2" + case 23: + *Type = AMDFAM17H; + if (Features & (1 << FEATURE_ADX)) { + *Subtype = AMDFAM17H_ZNVER1; + break; // "znver1" + } + *Subtype = AMD_BTVER1; + break; default: break; // "generic" } @@ -950,6 +960,15 @@ StringRef sys::getHostCPUName() { default: return "amdfam16"; } + case AMDFAM17H: + switch (Subtype) { + case AMD_BTVER1: + return "btver1"; + case AMDFAM17H_ZNVER1: + return "znver1"; + default: + return "amdfam17"; + } default: return "generic"; } diff --git a/lib/Support/TarWriter.cpp b/lib/Support/TarWriter.cpp index f79b364dc1f7..f06abf46cce4 100644 --- a/lib/Support/TarWriter.cpp +++ b/lib/Support/TarWriter.cpp @@ -54,6 +54,13 @@ struct UstarHeader { }; static_assert(sizeof(UstarHeader) == BlockSize, "invalid Ustar header"); +static UstarHeader makeUstarHeader() { + UstarHeader Hdr = {}; + memcpy(Hdr.Magic, "ustar", 5); // Ustar magic + memcpy(Hdr.Version, "00", 2); // Ustar version + return Hdr; +} + // A PAX attribute is in the form of "<length> <key>=<value>\n" // where <length> is the length of the entire string including // the length field itself. An example string is this. @@ -98,10 +105,9 @@ static void writePaxHeader(raw_fd_ostream &OS, StringRef Path) { std::string PaxAttr = formatPax("path", Path); // Create a 512-byte header. - UstarHeader Hdr = {}; + UstarHeader Hdr = makeUstarHeader(); snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", PaxAttr.size()); - Hdr.TypeFlag = 'x'; // PAX magic - memcpy(Hdr.Magic, "ustar", 6); // Ustar magic + Hdr.TypeFlag = 'x'; // PAX magic computeChecksum(Hdr); // Write them down. 
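The PAX record written above has the shape "<length> <key>=<value>\n", where the decimal length field counts the whole record including its own digits, so the width must be computed to a fixed point. A sketch of that computation (a standalone reimplementation for illustration, not the TarWriter.cpp helper verbatim):

    #include <string>

    std::string formatPaxSketch(const std::string &Key,
                                const std::string &Val) {
      // Record body without the length digits: ' ', '=', '\n' plus payload.
      int Len = Key.size() + Val.size() + 3;
      // Appending the digits can change the digit count (e.g. Len = 98
      // yields 100 on the first pass but the correct 101 on the second),
      // so compute twice.
      int Total = Len + std::to_string(Len).size();
      Total = Len + std::to_string(Total).size();
      return std::to_string(Total) + " " + Key + "=" + Val + "\n";
    }
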
@@ -116,7 +122,7 @@ static void writePaxHeader(raw_fd_ostream &OS, StringRef Path) { static std::pair<StringRef, StringRef> splitPath(StringRef Path) { if (Path.size() <= sizeof(UstarHeader::Name)) return {"", Path}; - size_t Sep = Path.rfind('/', sizeof(UstarHeader::Name) + 1); + size_t Sep = Path.rfind('/', sizeof(UstarHeader::Prefix) + 1); if (Sep == StringRef::npos) return {"", Path}; return {Path.substr(0, Sep), Path.substr(Sep + 1)}; @@ -138,11 +144,10 @@ static void writeUstarHeader(raw_fd_ostream &OS, StringRef Path, size_t Size) { StringRef Name; std::tie(Prefix, Name) = splitPath(Path); - UstarHeader Hdr = {}; + UstarHeader Hdr = makeUstarHeader(); memcpy(Hdr.Name, Name.data(), Name.size()); memcpy(Hdr.Mode, "0000664", 8); snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", Size); - memcpy(Hdr.Magic, "ustar", 6); memcpy(Hdr.Prefix, Prefix.data(), Prefix.size()); computeChecksum(Hdr); OS << StringRef(reinterpret_cast<char *>(&Hdr), sizeof(Hdr)); diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def index e927d58ad612..d472a54d9543 100644 --- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def +++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def @@ -18,9 +18,132 @@ namespace llvm { namespace AArch64 { -RegisterBank GPRRegBank; -RegisterBank FPRRegBank; -RegisterBank CCRRegBank; +const uint32_t GPRCoverageData[] = { + // Classes 0-31 + (1u << AArch64::GPR32allRegClassID) | (1u << AArch64::GPR32RegClassID) | + (1u << AArch64::GPR32spRegClassID) | + (1u << AArch64::GPR32commonRegClassID) | + (1u << AArch64::GPR32sponlyRegClassID) | + (1u << AArch64::GPR64allRegClassID) | (1u << AArch64::GPR64RegClassID) | + (1u << AArch64::GPR64spRegClassID) | + (1u << AArch64::GPR64commonRegClassID) | + (1u << AArch64::tcGPR64RegClassID) | + (1u << AArch64::GPR64sponlyRegClassID), + // Classes 32-63 + 0, + // FIXME: The entries below this point can be safely removed once this is + // tablegenerated. It's only needed because of the hardcoded register class + // limit. 
+ // Classes 64-96 + 0, + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +const uint32_t FPRCoverageData[] = { + // Classes 0-31 + (1u << AArch64::FPR8RegClassID) | (1u << AArch64::FPR16RegClassID) | + (1u << AArch64::FPR32RegClassID) | (1u << AArch64::FPR64RegClassID) | + (1u << AArch64::DDRegClassID) | (1u << AArch64::FPR128RegClassID) | + (1u << AArch64::FPR128_loRegClassID) | (1u << AArch64::DDDRegClassID) | + (1u << AArch64::DDDDRegClassID), + // Classes 32-63 + (1u << (AArch64::QQRegClassID - 32)) | + (1u << (AArch64::QQ_with_qsub0_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQ_with_qsub1_in_FPR128_loRegClassID - 32)) | + (1u + << (AArch64:: + QQQ_with_qsub1_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u << (AArch64::QQQQRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub0_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub1_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub2_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub3_in_FPR128_loRegClassID - 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub1_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub2_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQ_with_qsub0_in_FPR128_lo_and_QQ_with_qsub1_in_FPR128_loRegClassID - + 32)) | + (1u << (AArch64::QQQRegClassID - 32)) | + (1u << (AArch64::QQQ_with_qsub0_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQ_with_qsub1_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQ_with_qsub2_in_FPR128_loRegClassID - 32)) | + (1u + << (AArch64:: + QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub1_in_FPR128_loRegClassID - + 32)), + // FIXME: The entries below this point can be safely removed once this + // is tablegenerated. It's only needed because of the hardcoded register + // class limit. + // Classes 64-96 + 0, + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +const uint32_t CCRCoverageData[] = { + // Classes 0-31 + 1u << AArch64::CCRRegClassID, + // Classes 32-63 + 0, + // FIXME: The entries below this point can be safely removed once this + // is tablegenerated. It's only needed because of the hardcoded register + // class limit. 
+ // Classes 64-96 + 0, + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +RegisterBank GPRRegBank(AArch64::GPRRegBankID, "GPR", 64, GPRCoverageData); +RegisterBank FPRRegBank(AArch64::FPRRegBankID, "FPR", 512, FPRCoverageData); +RegisterBank CCRRegBank(AArch64::CCRRegBankID, "CCR", 32, CCRCoverageData); RegisterBank *RegBanks[] = {&GPRRegBank, &FPRRegBank, &CCRRegBank}; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 74a01835171b..7b581a706fa2 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -159,6 +159,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index c5b95f282ea8..2244baacca17 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -951,10 +951,7 @@ def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; defm CLS : OneOperandData<0b101, "cls">; defm CLZ : OneOperandData<0b100, "clz", ctlz>; -defm RBIT : OneOperandData<0b000, "rbit">; - -def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>; -def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>; +defm RBIT : OneOperandData<0b000, "rbit", bitreverse>; def REV16Wr : OneWRegData<0b001, "rev16", UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index a5fd2fbdde19..b292c9c87dcd 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -41,28 +41,30 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) if (AlreadyInit) return; AlreadyInit = true; - // Initialize the GPR bank. - createRegisterBank(AArch64::GPRRegBankID, "GPR"); - // The GPR register bank is fully defined by all the registers in - // GR64all + its subclasses. - addRegBankCoverage(AArch64::GPRRegBankID, AArch64::GPR64allRegClassID, TRI); + const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID); (void)RBGPR; assert(&AArch64::GPRRegBank == &RBGPR && "The order in RegBanks is messed up"); + + const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); + (void)RBFPR; + assert(&AArch64::FPRRegBank == &RBFPR && + "The order in RegBanks is messed up"); + + const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID); + (void)RBCCR; + assert(&AArch64::CCRRegBank == &RBCCR && + "The order in RegBanks is messed up"); + + // The GPR register bank is fully defined by all the registers in + // GR64all + its subclasses. assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) && "Subclass not added?"); assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit"); - // Initialize the FPR bank. - createRegisterBank(AArch64::FPRRegBankID, "FPR"); // The FPR register bank is fully defined by all the registers in // GR64all + its subclasses. 
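The static coverage tables added in the .def file above encode one bit per register class ID, thirty-two IDs per 32-bit word, which is what the (1u << ID) and (1u << (ID - 32)) terms spell out. The lookup they enable reduces to the following (a hypothetical helper; the in-tree query is RegisterBank::covers()):

    #include <cstdint>

    // Word RegClassID / 32, bit RegClassID % 32.
    bool coversClass(const uint32_t *CoverageData, unsigned RegClassID) {
      return (CoverageData[RegClassID / 32] &
              (1u << (RegClassID % 32))) != 0;
    }

    // e.g. coversClass(GPRCoverageData, AArch64::GPR32RegClassID) is true,
    // while coversClass(GPRCoverageData, AArch64::FPR32RegClassID) is false.
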
- addRegBankCoverage(AArch64::FPRRegBankID, AArch64::QQQQRegClassID, TRI); - const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); - (void)RBFPR; - assert(&AArch64::FPRRegBank == &RBFPR && - "The order in RegBanks is messed up"); assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) && "Subclass not added?"); assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) && @@ -70,13 +72,6 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) assert(RBFPR.getSize() == 512 && "FPRs should hold up to 512-bit via QQQQ sequence"); - // Initialize the CCR bank. - createRegisterBank(AArch64::CCRRegBankID, "CCR"); - addRegBankCoverage(AArch64::CCRRegBankID, AArch64::CCRRegClassID, TRI); - const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID); - (void)RBCCR; - assert(&AArch64::CCRRegBank == &RBCCR && - "The order in RegBanks is messed up"); assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) && "Class not added?"); assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit"); diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 1a17691fc584..b8833e5a5552 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -374,7 +374,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, int AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -466,28 +466,27 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, +int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, unsigned Alignment, unsigned AddressSpace) { - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + auto LT = TLI->getTypeLegalizationCost(DL, Ty); if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && - Src->isVectorTy() && Alignment != 16 && - Src->getVectorElementType()->isIntegerTy(64)) { - // Unaligned stores are extremely inefficient. We don't split - // unaligned v2i64 stores because the negative impact that has shown in - // practice on inlined memcpy code. - // We make v2i64 stores expensive so that we will only vectorize if there + LT.second.is128BitVector() && Alignment < 16) { + // Unaligned stores are extremely inefficient. We don't split all + // unaligned 128-bit stores because the negative impact that has shown in + // practice on inlined block copy code. + // We make such stores expensive so that we will only vectorize if there // are 6 other instructions getting vectorized. - int AmortizationCost = 6; + const int AmortizationCost = 6; return LT.first * 2 * AmortizationCost; } - if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) && - Src->getVectorNumElements() < 8) { + if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) && + Ty->getVectorNumElements() < 8) { // We scalarize the loads/stores because there is not v.4b register and we // have to promote the elements to v.4h. 
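To make the arithmetic of the scalarization branch that follows concrete: with two instructions generated per element and NumVecElts * 2 instructions to amortize, a v4i8 memory op costs (4 * 2) * 4 * 2 = 64 in this model, and a v2i8 op costs 16. As a sketch:

    // Mirrors the cost formula in the hunk below (i8 vectors narrower
    // than v8i8 are scalarized and promoted to v.4h).
    int scalarizedMemOpCost(unsigned NumVecElts) {
      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
      // Two instructions are generated per vector element.
      return NumVectorizableInstsToAmortize * NumVecElts * 2;
    }
    // scalarizedMemOpCost(4) == 64; scalarizedMemOpCost(2) == 16
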
- unsigned NumVecElts = Src->getVectorNumElements(); + unsigned NumVecElts = Ty->getVectorNumElements(); unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; // We generate 2 instructions per vector element. return NumVectorizableInstsToAmortize * NumVecElts * 2; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 849fd3d9b44a..18287ed6653f 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -102,7 +102,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 730bcdcf7afa..e48c1943cb01 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -434,6 +434,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setSchedulingPreference(Sched::RegPressure); setJumpIsExpensive(true); + + // FIXME: This is only partially true. If we have to do vector compares, any + // SGPR pair can be a condition register. If we have a uniform condition, we + // are better off doing SALU operations, where there is only one SCC. For now, + // we don't have a way of knowing during instruction selection if a condition + // will be uniform and we always use vector compares. Assume we are using + // vector compares until that is fixed. setHasMultipleConditionRegisters(true); // SI at least has hardware support for floating point exceptions, but no way @@ -470,12 +477,31 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FNEG); } //===----------------------------------------------------------------------===// // Target Information //===----------------------------------------------------------------------===// +static bool fnegFoldsIntoOp(unsigned Opc) { + switch (Opc) { + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FMA: + case ISD::FMAD: + case ISD::FSIN: + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::SIN_HW: + case AMDGPUISD::FMUL_LEGACY: + return true; + default: + return false; + } +} + MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { return MVT::i32; } @@ -2679,8 +2705,93 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, return SDValue(); } +static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, + unsigned Op, + const SDLoc &SL, + SDValue Cond, + SDValue N1, + SDValue N2) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N1.getValueType(); + + SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, + N1.getOperand(0), N2.getOperand(0)); + DCI.AddToWorklist(NewSelect.getNode()); + return DAG.getNode(Op, SL, VT, NewSelect); +} + +// Pull a free FP operation out of a select so it may fold into uses. 
+// +// select c, (fneg x), (fneg y) -> fneg (select c, x, y) +// select c, (fneg x), k -> fneg (select c, x, (fneg k)) +// +// select c, (fabs x), (fabs y) -> fabs (select c, x, y) +// select c, (fabs x), +k -> fabs (select c, x, k) +static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, + SDValue N) { + SelectionDAG &DAG = DCI.DAG; + SDValue Cond = N.getOperand(0); + SDValue LHS = N.getOperand(1); + SDValue RHS = N.getOperand(2); + + EVT VT = N.getValueType(); + if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || + (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { + return distributeOpThroughSelect(DCI, LHS.getOpcode(), + SDLoc(N), Cond, LHS, RHS); + } + + bool Inv = false; + if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) { + std::swap(LHS, RHS); + Inv = true; + } + + // TODO: Support vector constants. + ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); + if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) { + SDLoc SL(N); + // If one side is an fneg/fabs and the other is a constant, we can push the + // fneg/fabs down. If it's an fabs, the constant needs to be non-negative. + SDValue NewLHS = LHS.getOperand(0); + SDValue NewRHS = RHS; + + // Careful: if the neg can be folded up, don't try to pull it back down. + bool ShouldFoldNeg = true; + + if (NewLHS.hasOneUse()) { + unsigned Opc = NewLHS.getOpcode(); + if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc)) + ShouldFoldNeg = false; + if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL) + ShouldFoldNeg = false; + } + + if (ShouldFoldNeg) { + if (LHS.getOpcode() == ISD::FNEG) + NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else if (CRHS->isNegative()) + return SDValue(); + + if (Inv) + std::swap(NewLHS, NewRHS); + + SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, + Cond, NewLHS, NewRHS); + DCI.AddToWorklist(NewSelect.getNode()); + return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect); + } + } + + return SDValue(); +} + + SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) + return Folded; + SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); @@ -2724,6 +2835,129 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); } +SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + unsigned Opc = N0.getOpcode(); + + // If the input has multiple uses and we can either fold the negate down, or + // the other uses cannot, give up. This both prevents unprofitable + // transformations and infinite loops: we won't repeatedly try to fold around + // a negate that has no 'good' form. 
+ // + // TODO: Check users can fold + if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse()) + return SDValue(); + + SDLoc SL(N); + switch (Opc) { + case ISD::FADD: { + // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() != ISD::FNEG) + LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); + else + LHS = LHS.getOperand(0); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FMUL: + case AMDGPUISD::FMUL_LEGACY: { + // (fneg (fmul x, y)) -> (fmul x, (fneg y)) + // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() == ISD::FNEG) + LHS = LHS.getOperand(0); + else if (RHS.getOpcode() == ISD::FNEG) + RHS = RHS.getOperand(0); + else + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FMA: + case ISD::FMAD: { + // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) + SDValue LHS = N0.getOperand(0); + SDValue MHS = N0.getOperand(1); + SDValue RHS = N0.getOperand(2); + + if (LHS.getOpcode() == ISD::FNEG) + LHS = LHS.getOperand(0); + else if (MHS.getOpcode() == ISD::FNEG) + MHS = MHS.getOperand(0); + else + MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FP_EXTEND: + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_LEGACY: + case ISD::FSIN: + case AMDGPUISD::SIN_HW: { + SDValue CvtSrc = N0.getOperand(0); + if (CvtSrc.getOpcode() == ISD::FNEG) { + // (fneg (fp_extend (fneg x))) -> (fp_extend x) + // (fneg (rcp (fneg x))) -> (rcp x) + return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0)); + } + + if (!N0.hasOneUse()) + return SDValue(); + + // (fneg (fp_extend x)) -> (fp_extend (fneg x)) + // (fneg (rcp x)) -> (rcp (fneg x)) + SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); + return DAG.getNode(Opc, SL, VT, Neg); + } + case ISD::FP_ROUND: { + SDValue CvtSrc = N0.getOperand(0); + + if (CvtSrc.getOpcode() == ISD::FNEG) { + // (fneg (fp_round (fneg x))) -> (fp_round x) + return DAG.getNode(ISD::FP_ROUND, SL, VT, + CvtSrc.getOperand(0), N0.getOperand(1)); + } + + if (!N0.hasOneUse()) + return SDValue(); + + // (fneg (fp_round x)) -> (fp_round (fneg x)) + SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); + return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); + } + default: + return SDValue(); + } +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2829,6 +3063,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performMulLoHi24Combine(N, DCI); case ISD::SELECT: return performSelectCombine(N, DCI); + case ISD::FNEG: + return performFNegCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h 
b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 745c9923de2e..69567aa5f713 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -84,6 +84,7 @@ protected: SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 513df3a9cdf3..59cba636c586 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -629,9 +629,10 @@ def smax_oneuse : HasOneUseBinOp<smax>; def smin_oneuse : HasOneUseBinOp<smin>; def umax_oneuse : HasOneUseBinOp<umax>; def umin_oneuse : HasOneUseBinOp<umin>; -def sub_oneuse : HasOneUseBinOp<sub>; } // Properties = [SDNPCommutative, SDNPAssociative] +def sub_oneuse : HasOneUseBinOp<sub>; + def select_oneuse : HasOneUseTernaryOp<select>; // Special conversion patterns diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index a1a352642242..e90487065992 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -110,7 +110,7 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { int AMDGPUTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) { EVT OrigTy = TLI->getValueType(DL, Ty); if (!OrigTy.isSimple()) { diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 1177007644ff..0d83b2a585bf 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -83,7 +83,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); unsigned getCFInstrCost(unsigned Opcode); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index da9d009c542b..3cf9a1d92469 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -214,7 +214,7 @@ public: } bool isReg() const override { - return isRegKind() && !Reg.Mods.hasModifiers(); + return isRegKind() && !hasModifiers(); } bool isRegOrImmWithInputMods(MVT type) const { @@ -245,6 +245,15 @@ public: return isRegOrImmWithInputMods(MVT::f64); } + bool isVReg() const { + return isRegClass(AMDGPU::VGPR_32RegClassID) || + isRegClass(AMDGPU::VReg_64RegClassID) || + isRegClass(AMDGPU::VReg_96RegClassID) || + isRegClass(AMDGPU::VReg_128RegClassID) || + isRegClass(AMDGPU::VReg_256RegClassID) || + isRegClass(AMDGPU::VReg_512RegClassID); + } + bool isVReg32OrOff() const { return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID); } @@ -299,28 +308,32 @@ public: bool isRegClass(unsigned RCID) const; + bool isRegOrInlineNoMods(unsigned RCID, MVT type) const { + return 
(isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers(); + } + bool isSCSrcB16() const { - return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i16); + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16); } bool isSCSrcB32() const { - return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i32); + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32); } bool isSCSrcB64() const { - return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::i64); + return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::i64); } bool isSCSrcF16() const { - return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f16); + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16); } bool isSCSrcF32() const { - return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f32); + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32); } bool isSCSrcF64() const { - return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::f64); + return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::f64); } bool isSSrcB32() const { @@ -350,27 +363,27 @@ public: } bool isVCSrcB32() const { - return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i32); + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32); } bool isVCSrcB64() const { - return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::i64); + return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64); } bool isVCSrcB16() const { - return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i16); + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16); } bool isVCSrcF32() const { - return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f32); + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32); } bool isVCSrcF64() const { - return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::f64); + return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } bool isVCSrcF16() const { - return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f16); + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16); } bool isVSrcB32() const { @@ -534,6 +547,23 @@ public: addRegOrImmWithInputModsOperands(Inst, N); } + void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { + Modifiers Mods = getModifiers(); + Inst.addOperand(MCOperand::createImm(Mods.getModifiersOperand())); + assert(isRegKind()); + addRegOperands(Inst, N); + } + + void addRegWithFPInputModsOperands(MCInst &Inst, unsigned N) const { + assert(!hasIntModifiers()); + addRegWithInputModsOperands(Inst, N); + } + + void addRegWithIntInputModsOperands(MCInst &Inst, unsigned N) const { + assert(!hasFPModifiers()); + addRegWithInputModsOperands(Inst, N); + } + void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { if (isImm()) addImmOperands(Inst, N); @@ -852,9 +882,12 @@ public: StringRef &Value); OperandMatchResultTy parseImm(OperandVector &Operands); + OperandMatchResultTy parseReg(OperandVector &Operands); OperandMatchResultTy parseRegOrImm(OperandVector &Operands); - OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands); - OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands); + OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true); + OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true); + OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands); + OperandMatchResultTy 
parseRegWithIntInputMods(OperandVector &Operands); OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands); void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); @@ -1057,7 +1090,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { } bool AMDGPUOperand::isRegClass(unsigned RCID) const { - return isReg() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); + return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const { @@ -1468,23 +1501,28 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) { } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { - auto res = parseImm(Operands); - if (res != MatchOperand_NoMatch) { - return res; - } - +AMDGPUAsmParser::parseReg(OperandVector &Operands) { if (auto R = parseRegister()) { assert(R->isReg()); R->Reg.IsForcedVOP3 = isForcedVOP3(); Operands.push_back(std::move(R)); return MatchOperand_Success; } - return MatchOperand_ParseFail; + return MatchOperand_NoMatch; } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { +AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { + auto res = parseImm(Operands); + if (res != MatchOperand_NoMatch) { + return res; + } + + return parseReg(Operands); +} + +OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) { // XXX: During parsing we can't determine if minus sign means // negate-modifier or negative immediate value. // By default we suppose it is modifier. @@ -1514,7 +1552,12 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { Abs = true; } - auto Res = parseRegOrImm(Operands); + OperandMatchResultTy Res; + if (AllowImm) { + Res = parseRegOrImm(Operands); + } else { + Res = parseReg(Operands); + } if (Res != MatchOperand_Success) { return Res; } @@ -1548,7 +1591,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { +AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) { bool Sext = false; if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") { @@ -1561,7 +1604,12 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { Parser.Lex(); } - auto Res = parseRegOrImm(Operands); + OperandMatchResultTy Res; + if (AllowImm) { + Res = parseRegOrImm(Operands); + } else { + Res = parseReg(Operands); + } if (Res != MatchOperand_Success) { return Res; } @@ -1584,6 +1632,16 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { return MatchOperand_Success; } +OperandMatchResultTy +AMDGPUAsmParser::parseRegWithFPInputMods(OperandVector &Operands) { + return parseRegOrImmWithFPInputMods(Operands, false); +} + +OperandMatchResultTy +AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) { + return parseRegOrImmWithIntInputMods(Operands, false); +} + OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) { std::unique_ptr<AMDGPUOperand> Reg = parseRegister(); if (Reg) { @@ -3382,7 +3440,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { // Skip it. 
continue; } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { - Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + Op.addRegWithFPInputModsOperands(Inst, 2); } else if (Op.isDPPCtrl()) { Op.addImmOperands(Inst, 1); } else if (Op.isImm()) { @@ -3508,7 +3566,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, // Skip it. continue; } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { - Op.addRegOrImmWithInputModsOperands(Inst, 2); + Op.addRegWithInputModsOperands(Inst, 2); } else if (Op.isImm()) { // Handle optional arguments OptionalIdx[Op.getImmTy()] = I; diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td index 4112ad100584..48c6592ca5b2 100644 --- a/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -333,11 +333,13 @@ def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", def DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; -def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; +def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", fp_to_f16, VecALU>; +def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>; +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 831ac5948a68..a5c0d4923d6b 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -25,25 +25,6 @@ using namespace llvm; namespace { -class SIFoldOperands : public MachineFunctionPass { -public: - static char ID; - -public: - SIFoldOperands() : MachineFunctionPass(ID) { - initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return "SI Fold Operands"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - struct FoldCandidate { MachineInstr *UseMI; union { @@ -79,6 +60,36 @@ struct FoldCandidate { } }; +class SIFoldOperands : public MachineFunctionPass { +public: + static char ID; + MachineRegisterInfo *MRI; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + + void foldOperand(MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace) const; + + void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; + +public: + SIFoldOperands() : MachineFunctionPass(ID) { + initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "SI Fold Operands"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + } // End anonymous namespace. 
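Review note: the reshuffle above is the structural core of this SIFoldOperands change. FoldCandidate moves ahead of the pass class, and the pass now caches MachineRegisterInfo, SIInstrInfo and SIRegisterInfo as members, so the former static helpers become const methods instead of threading a (TII, TRI, MRI) triple through every call. A minimal sketch of the resulting shape (illustrative class and method names, not the exact LLVM declarations):

    class ExampleFoldPass : public MachineFunctionPass {
      static char ID;
      MachineRegisterInfo *MRI = nullptr;  // cached once per function
      const SIInstrInfo *TII = nullptr;
      const SIRegisterInfo *TRI = nullptr;

      // Formerly a static helper taking (TII, TRI, MRI) as parameters;
      // as a const member it reads the cached fields instead.
      void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
                       unsigned UseOpIdx) const;

    public:
      ExampleFoldPass() : MachineFunctionPass(ID) {}

      bool runOnMachineFunction(MachineFunction &MF) override {
        MRI = &MF.getRegInfo();
        TII = MF.getSubtarget<SISubtarget>().getInstrInfo();
        TRI = &TII->getRegisterInfo();
        // ... walk the function and invoke foldOperand(), as in the
        // hunks that follow ...
        return false;
      }
    };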
INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE, @@ -88,6 +99,34 @@ char SIFoldOperands::ID = 0; char &llvm::SIFoldOperandsID = SIFoldOperands::ID; +// Wrapper around isInlineConstant that understands special cases when +// instruction types are replaced during operand folding. +static bool isInlineConstantIfFolded(const SIInstrInfo *TII, + const MachineInstr &UseMI, + unsigned OpNo, + const MachineOperand &OpToFold) { + if (TII->isInlineConstant(UseMI, OpNo, OpToFold)) + return true; + + unsigned Opc = UseMI.getOpcode(); + switch (Opc) { + case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAC_F16_e64: { + // Special case for mac. Since this is replaced with mad when folded into + // src2, we need to check the legality for the final instruction. + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (static_cast<int>(OpNo) == Src2Idx) { + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + const MCInstrDesc &MadDesc + = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); + } + } + default: + return false; + } +} + FunctionPass *llvm::createSIFoldOperandsPass() { return new SIFoldOperands(); } @@ -141,7 +180,7 @@ static bool updateOperand(FoldCandidate &Fold, return false; } -static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList, +static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList, const MachineInstr *MI) { for (auto Candidate : FoldList) { if (Candidate.UseMI == MI) @@ -150,7 +189,7 @@ static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList, return false; } -static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, +static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *OpToFold, const SIInstrInfo *TII) { @@ -160,7 +199,7 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, unsigned Opc = MI->getOpcode(); if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) && (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { - bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; // Check if changing this to a v_mad_{f16, f32} instruction will allow us // to fold the operand. 
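Review note: the subtlety in isInlineConstantIfFolded above is that legality is judged against the opcode the fold will produce, not the one currently in the MIR: folding into src2 of v_mac_f32/v_mac_f16 rewrites the mac as v_mad_f32/v_mad_f16, whose src2 has a different operand descriptor. The same decision with the switch and its fall-through unrolled into straight-line form (hypothetical helper name; the calls themselves are the ones used in the hunk):

    static bool inlineAfterFold(const SIInstrInfo *TII,
                                const MachineInstr &UseMI, unsigned OpNo,
                                const MachineOperand &OpToFold) {
      if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
        return true;
      unsigned Opc = UseMI.getOpcode();
      if (Opc != AMDGPU::V_MAC_F32_e64 && Opc != AMDGPU::V_MAC_F16_e64)
        return false;
      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      if (static_cast<int>(OpNo) != Src2Idx)
        return false;
      // Folding into src2 turns the mac into a mad, so the immediate
      // must be inline for the mad's src2 operand descriptor.
      const MCInstrDesc &MadDesc =
          TII->get(Opc == AMDGPU::V_MAC_F32_e64 ? AMDGPU::V_MAD_F32
                                                : AMDGPU::V_MAD_F16);
      return TII->isInlineConstant(OpToFold,
                                   MadDesc.OpInfo[OpNo].OperandType);
    }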
@@ -227,12 +266,12 @@ static bool isUseSafeToFold(const MachineInstr &MI, //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg()); } -static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, - unsigned UseOpIdx, - std::vector<FoldCandidate> &FoldList, - SmallVectorImpl<MachineInstr *> &CopiesToReplace, - const SIInstrInfo *TII, const SIRegisterInfo &TRI, - MachineRegisterInfo &MRI) { +void SIFoldOperands::foldOperand( + MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace) const { const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); if (!isUseSafeToFold(*UseMI, UseOp)) @@ -264,7 +303,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); for (MachineRegisterInfo::use_iterator - RSUse = MRI.use_begin(RegSeqDstReg), RSE = MRI.use_end(); + RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end(); RSUse != RSE; ++RSUse) { MachineInstr *RSUseMI = RSUse->getParent(); @@ -272,7 +311,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, continue; foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, - CopiesToReplace, TII, TRI, MRI); + CopiesToReplace); } return; @@ -287,8 +326,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, unsigned DestReg = UseMI->getOperand(0).getReg(); const TargetRegisterClass *DestRC = TargetRegisterInfo::isVirtualRegister(DestReg) ? - MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); + MRI->getRegClass(DestReg) : + TRI->getPhysRegClass(DestReg); unsigned MovOp = TII->getMovOpcode(DestRC); if (MovOp == AMDGPU::COPY) @@ -318,7 +357,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc(); const TargetRegisterClass *FoldRC = - TRI.getRegClass(FoldDesc.OpInfo[0].RegClass); + TRI->getRegClass(FoldDesc.OpInfo[0].RegClass); APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType), OpToFold.getImm()); @@ -328,8 +367,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseReg = UseOp.getReg(); const TargetRegisterClass *UseRC = TargetRegisterInfo::isVirtualRegister(UseReg) ? - MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); + MRI->getRegClass(UseReg) : + TRI->getPhysRegClass(UseReg); assert(Imm.getBitWidth() == 64); @@ -349,20 +388,51 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, } static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, - int32_t LHS, int32_t RHS) { + uint32_t LHS, uint32_t RHS) { switch (Opcode) { case AMDGPU::V_AND_B32_e64: + case AMDGPU::V_AND_B32_e32: case AMDGPU::S_AND_B32: Result = LHS & RHS; return true; case AMDGPU::V_OR_B32_e64: + case AMDGPU::V_OR_B32_e32: case AMDGPU::S_OR_B32: Result = LHS | RHS; return true; case AMDGPU::V_XOR_B32_e64: + case AMDGPU::V_XOR_B32_e32: case AMDGPU::S_XOR_B32: Result = LHS ^ RHS; return true; + case AMDGPU::V_LSHL_B32_e64: + case AMDGPU::V_LSHL_B32_e32: + case AMDGPU::S_LSHL_B32: + // The instruction ignores the high bits for out of bounds shifts. 
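// Review note (worked example, not part of the patch): 32-bit shifts
// consume only the low five bits of the shift amount, so with LHS = 1
// and RHS = 35 the folder computes 1 << (35 & 31) == 1 << 3 == 8,
// matching the hardware result. The *REV opcodes handled below swap
// the operand roles, e.g. lshlrev evaluates RHS << (LHS & 31).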
+ Result = LHS << (RHS & 31); + return true; + case AMDGPU::V_LSHLREV_B32_e64: + case AMDGPU::V_LSHLREV_B32_e32: + Result = RHS << (LHS & 31); + return true; + case AMDGPU::V_LSHR_B32_e64: + case AMDGPU::V_LSHR_B32_e32: + case AMDGPU::S_LSHR_B32: + Result = LHS >> (RHS & 31); + return true; + case AMDGPU::V_LSHRREV_B32_e64: + case AMDGPU::V_LSHRREV_B32_e32: + Result = RHS >> (LHS & 31); + return true; + case AMDGPU::V_ASHR_I32_e64: + case AMDGPU::V_ASHR_I32_e32: + case AMDGPU::S_ASHR_I32: + Result = static_cast<int32_t>(LHS) >> (RHS & 31); + return true; + case AMDGPU::V_ASHRREV_I32_e64: + case AMDGPU::V_ASHRREV_I32_e32: + Result = static_cast<int32_t>(RHS) >> (LHS & 31); + return true; default: return false; } @@ -390,33 +460,47 @@ static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { stripExtraCopyOperands(MI); } +static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, + MachineOperand &Op) { + if (Op.isReg()) { + // If this has a subregister, it obviously is a register source. + if (Op.getSubReg() != AMDGPU::NoSubRegister) + return &Op; + + MachineInstr *Def = MRI.getVRegDef(Op.getReg()); + if (Def->isMoveImmediate()) { + MachineOperand &ImmSrc = Def->getOperand(1); + if (ImmSrc.isImm()) + return &ImmSrc; + } + } + + return &Op; +} + // Try to simplify operations with a constant that may appear after instruction // selection. +// TODO: See if a frame index with a fixed offset can fold. static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, - MachineInstr *MI) { + MachineInstr *MI, + MachineOperand *ImmOp) { unsigned Opc = MI->getOpcode(); - if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || Opc == AMDGPU::S_NOT_B32) { - MachineOperand &Src0 = MI->getOperand(1); - if (Src0.isImm()) { - Src0.setImm(~Src0.getImm()); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); - return true; - } - - return false; + MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm()); + mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); + return true; } - if (!MI->isCommutable()) + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) return false; int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx)); + MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx)); - MachineOperand *Src0 = &MI->getOperand(Src0Idx); - MachineOperand *Src1 = &MI->getOperand(Src1Idx); if (!Src0->isImm() && !Src1->isImm()) return false; @@ -431,19 +515,26 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI = TII->getRegisterInfo(); bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg()); - Src0->setImm(NewImm); + // Be careful to change the right operand, src0 may belong to a different + // instruction. 
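// Review note: after getImmOrMaterializedImm() above, Src0/Src1 may
// point into the v_mov that defined the register rather than into MI
// itself, e.g.
//   %a = V_MOV_B32_e32 7        <- Src0 can now alias this operand
//   %b = V_AND_B32_e32 %a, %x   <- the MI actually being folded
// which is exactly why the rewrite below goes through
// MI->getOperand(Src0Idx) rather than through *Src0.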
+ MI->getOperand(Src0Idx).ChangeToImmediate(NewImm); MI->RemoveOperand(Src1Idx); mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR))); return true; } + if (!MI->isCommutable()) + return false; + if (Src0->isImm() && !Src1->isImm()) { std::swap(Src0, Src1); std::swap(Src0Idx, Src1Idx); } int32_t Src1Val = static_cast<int32_t>(Src1->getImm()); - if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) { + if (Opc == AMDGPU::V_OR_B32_e64 || + Opc == AMDGPU::V_OR_B32_e32 || + Opc == AMDGPU::S_OR_B32) { if (Src1Val == 0) { // y = or x, 0 => y = copy x MI->RemoveOperand(Src1Idx); @@ -459,6 +550,7 @@ } if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 || + MI->getOpcode() == AMDGPU::V_AND_B32_e32 || MI->getOpcode() == AMDGPU::S_AND_B32) { if (Src1Val == 0) { // y = and x, 0 => y = v_mov_b32 0 @@ -476,29 +568,136 @@ } if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 || + MI->getOpcode() == AMDGPU::V_XOR_B32_e32 || MI->getOpcode() == AMDGPU::S_XOR_B32) { if (Src1Val == 0) { // y = xor x, 0 => y = copy x MI->RemoveOperand(Src1Idx); mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + return true; } } return false; } +void SIFoldOperands::foldInstOperand(MachineInstr &MI, + MachineOperand &OpToFold) const { + // We need to mutate the operands of new mov instructions to add implicit + // uses of EXEC, but adding them invalidates the use_iterator, so defer + // this. + SmallVector<MachineInstr *, 4> CopiesToReplace; + SmallVector<FoldCandidate, 4> FoldList; + MachineOperand &Dst = MI.getOperand(0); + + bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); + if (FoldingImm) { + unsigned NumLiteralUses = 0; + MachineOperand *NonInlineUse = nullptr; + int NonInlineUseOpNo = -1; + + MachineRegisterInfo::use_iterator NextUse, NextInstUse; + for (MachineRegisterInfo::use_iterator + Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); + Use != E; Use = NextUse) { + NextUse = std::next(Use); + MachineInstr *UseMI = Use->getParent(); + unsigned OpNo = Use.getOperandNo(); + + // Folding the immediate may reveal operations that can be constant + // folded or replaced with a copy. This can happen for example after + // frame indices are lowered to constants or from splitting 64-bit + // constants. + // + // We may also encounter cases where one or both operands are + // immediates materialized into a register, which would ordinarily not + // be folded due to multiple uses or operand constraints. + + if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) { + DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n'); + + // Some constant folding cases change the same immediate's use to a new + // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user + // again. The same constant folded instruction could also have a second + // use operand. + NextUse = MRI->use_begin(Dst.getReg()); + continue; + } + + // Try to fold any inline immediate uses, and then only fold other + // constants if they have one use. + // + // The legality of the inline immediate must be checked based on the use + // operand, not the defining instruction, because 32-bit instructions + // with 32-bit inline immediate sources may be used to materialize + // constants used in 16-bit operands. + // + // e.g. it is unsafe to fold: + // s_mov_b32 s0, 1.0 // materializes 0x3f800000 + // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 + + // Folding immediates with more than one use will increase program size.
+ // FIXME: This will also reduce register usage, which may be better + // in some cases. A better heuristic is needed. + if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { + foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); + } else { + if (++NumLiteralUses == 1) { + NonInlineUse = &*Use; + NonInlineUseOpNo = OpNo; + } + } + } + + if (NumLiteralUses == 1) { + MachineInstr *UseMI = NonInlineUse->getParent(); + foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace); + } + } else { + // Folding register. + for (MachineRegisterInfo::use_iterator + Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); + Use != E; ++Use) { + MachineInstr *UseMI = Use->getParent(); + + foldOperand(OpToFold, UseMI, Use.getOperandNo(), + FoldList, CopiesToReplace); + } + } + + MachineFunction *MF = MI.getParent()->getParent(); + // Make sure we add EXEC uses to any new v_mov instructions created. + for (MachineInstr *Copy : CopiesToReplace) + Copy->addImplicitDefUseOperands(*MF); + + for (FoldCandidate &Fold : FoldList) { + if (updateOperand(Fold, *TRI)) { + // Clear kill flags. + if (Fold.isReg()) { + assert(Fold.OpToFold && Fold.OpToFold->isReg()); + // FIXME: Probably shouldn't bother trying to fold if not an + // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR + // copies. + MRI->clearKillFlags(Fold.OpToFold->getReg()); + } + DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << + static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); + } + } +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { + BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; MachineBasicBlock::iterator I, Next; @@ -512,8 +711,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { MachineOperand &OpToFold = MI.getOperand(1); bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); - // FIXME: We could also be folding things like FrameIndexes and - // TargetIndexes. + // FIXME: We could also be folding things like TargetIndexes. if (!FoldingImm && !OpToFold.isReg()) continue; @@ -532,90 +730,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) continue; - // We need mutate the operands of new mov instructions to add implicit - // uses of EXEC, but adding them invalidates the use_iterator, so defer - // this. - SmallVector<MachineInstr *, 4> CopiesToReplace; - - std::vector<FoldCandidate> FoldList; - if (FoldingImm) { - unsigned NumLiteralUses = 0; - MachineOperand *NonInlineUse = nullptr; - int NonInlineUseOpNo = -1; - - // Try to fold any inline immediate uses, and then only fold other - // constants if they have one use. - // - // The legality of the inline immediate must be checked based on the use - // operand, not the defining instruction, because 32-bit instructions - // with 32-bit inline immediate sources may be used to materialize - // constants used in 16-bit operands. - // - // e.g. 
it is unsafe to fold: - // s_mov_b32 s0, 1.0 // materializes 0x3f800000 - // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - for (MachineRegisterInfo::use_iterator - Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end(); - Use != E; ++Use) { - MachineInstr *UseMI = Use->getParent(); - unsigned OpNo = Use.getOperandNo(); - - if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, - CopiesToReplace, TII, TRI, MRI); - } else { - if (++NumLiteralUses == 1) { - NonInlineUse = &*Use; - NonInlineUseOpNo = OpNo; - } - } - } - - if (NumLiteralUses == 1) { - MachineInstr *UseMI = NonInlineUse->getParent(); - foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, - CopiesToReplace, TII, TRI, MRI); - } - } else { - // Folding register. - for (MachineRegisterInfo::use_iterator - Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end(); - Use != E; ++Use) { - MachineInstr *UseMI = Use->getParent(); - - foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, - CopiesToReplace, TII, TRI, MRI); - } - } - - // Make sure we add EXEC uses to any new v_mov instructions created. - for (MachineInstr *Copy : CopiesToReplace) - Copy->addImplicitDefUseOperands(MF); - - for (FoldCandidate &Fold : FoldList) { - if (updateOperand(Fold, TRI)) { - // Clear kill flags. - if (Fold.isReg()) { - assert(Fold.OpToFold && Fold.OpToFold->isReg()); - // FIXME: Probably shouldn't bother trying to fold if not an - // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR - // copies. - MRI.clearKillFlags(Fold.OpToFold->getReg()); - } - DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << - static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); - - // Folding the immediate may reveal operations that can be constant - // folded or replaced with a copy. This can happen for example after - // frame indices are lowered to constants or from splitting 64-bit - // constants. 
- tryConstantFoldOp(MRI, TII, Fold.UseMI); - } - } + foldInstOperand(MI, OpToFold); } } return false; diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 34096e158039..ebaefae3bfef 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -557,6 +557,27 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; +def FPVRegInputModsMatchClass : AsmOperandClass { + let Name = "VRegWithFPInputMods"; + let ParserMethod = "parseRegWithFPInputMods"; + let PredicateMethod = "isVReg"; +} + +def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { + let PrintMethod = "printOperandAndFPInputMods"; +} + +def IntVRegInputModsMatchClass : AsmOperandClass { + let Name = "VRegWithIntInputMods"; + let ParserMethod = "parseRegWithIntInputMods"; + let PredicateMethod = "isVReg"; +} + +def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> { + let PrintMethod = "printOperandAndIntInputMods"; +} + + //===----------------------------------------------------------------------===// // Complex patterns //===----------------------------------------------------------------------===// @@ -761,6 +782,15 @@ class getSrcMod <ValueType VT> { ); } +// Return type of input modifiers operand specified input operand for SDWA/DPP +class getSrcModExt <ValueType VT> { + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + 0))); + Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); +} + // Returns the input arguments for VOP[12C] instructions for the given SrcVT. class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 @@ -1001,6 +1031,11 @@ class VOPProfile <list<ValueType> _ArgVT> { field Operand Src0Mod = getSrcMod<Src0VT>.ret; field Operand Src1Mod = getSrcMod<Src1VT>.ret; field Operand Src2Mod = getSrcMod<Src2VT>.ret; + field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret; + field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret; + field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret; + field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; + field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); field bit HasDst32 = HasDst; @@ -1038,15 +1073,16 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Outs32 = Outs; field dag Outs64 = Outs; field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; - field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; + field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod>.ret; + HasModifiers, Src0ModDPP, Src1ModDPP>.ret; field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod, DstVT>.ret; + HasModifiers, Src0ModSDWA, Src1ModSDWA, + DstVT>.ret; field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret; field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index bc35c2edc8d3..b86c04191189 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td 
@@ -871,6 +871,11 @@ def : Pat < >; def : Pat < + (i16 (sext_inreg i16:$src, i1)), + (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16 +>; + +def : Pat < (i16 (sext_inreg i16:$src, i8)), (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 >; diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index b27d7c691032..dd31dc690840 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -84,12 +84,17 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add // a special case for it. It can only be shrunk if the third operand // is vcc. We should handle this the same way we handle vopc, by adding - // a register allocation hint pre-regalloc and then do the shrining + // a register allocation hint pre-regalloc and then do the shrinking // post-regalloc. if (Src2) { switch (MI.getOpcode()) { default: return false; + case AMDGPU::V_ADDC_U32_e64: + case AMDGPU::V_SUBB_U32_e64: + // Additional verification is needed for sdst/src2. + return true; + case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: if (!isVGPR(Src2, TRI, MRI) || @@ -174,7 +179,7 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig) { for (MachineOperand &Use : MI.implicit_operands()) { - if (Use.getReg() == AMDGPU::VCC) { + if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { Use.setIsUndef(Orig.isUndef()); Use.setIsKill(Orig.isKill()); return; @@ -456,6 +461,31 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } + // Check for the bool flag output for instructions like V_ADD_I32_e64. + const MachineOperand *SDst = TII->getNamedOperand(MI, + AMDGPU::OpName::sdst); + + // Check the carry-in operand for v_addc_u32_e64. + const MachineOperand *Src2 = TII->getNamedOperand(MI, + AMDGPU::OpName::src2); + + if (SDst) { + if (SDst->getReg() != AMDGPU::VCC) { + if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) + MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC); + continue; + } + + // All of the instructions with carry outs also have an SGPR input in + // src2.
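// Review note: the e32 encodings of v_addc_u32/v_subb_u32 read and
// write VCC implicitly, so the shrink is only legal once both the
// carry-out (sdst) and the carry-in (src2) live in VCC. When either
// is assigned elsewhere, the code below records a VCC allocation hint
// for the virtual register and skips the instruction rather than
// shrinking it.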
+ if (Src2 && Src2->getReg() != AMDGPU::VCC) { + if (TargetRegisterInfo::isVirtualRegister(Src2->getReg())) + MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC); + + continue; + } + } + // We can shrink this instruction DEBUG(dbgs() << "Shrinking " << MI); @@ -481,8 +511,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (Src1) Inst32.addOperand(*Src1); - const MachineOperand *Src2 = - TII->getNamedOperand(MI, AMDGPU::OpName::src2); if (Src2) { int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); if (Op32Src2Idx != -1) { diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index bff706cdc1dc..a15b9ceff2f4 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -232,7 +232,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); - let InsSDWA = (ins Src0RC32:$vdst, Int32InputMods:$src0_modifiers, VCSrc_b32:$src0, + let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel); diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 20fb7f7bcab7..00e5ab3db0b7 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -183,13 +183,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; - let InsDPP = (ins FP32InputMods:$src0_modifiers, Src0DPP:$src0, - FP32InputMods:$src1_modifiers, Src1DPP:$src1, + let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + Src1ModDPP:$src1_modifiers, Src1DPP:$src1, VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); - let InsSDWA = (ins FP32InputMods:$src0_modifiers, Src0SDWA:$src0, - FP32InputMods:$src1_modifiers, Src1SDWA:$src1, + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, VGPR_32:$src2, // stub argument clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index c431d9db801e..16a456da3c67 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -517,8 +517,8 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : VOPC_Profile<sched, vt, i32> { let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$sdst, $src0_modifiers, $src1"; - let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, - Int32InputMods:$src1_modifiers, Src1RC64:$src1, + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel"; let HasSrc1Mods = 0; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index afba1587a743..32b7c87e61bb 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ 
b/lib/Target/ARM/ARMISelLowering.cpp @@ -608,15 +608,27 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have // a __gnu_ prefix (which is the default). if (Subtarget->isTargetAEABI()) { - setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h"); - setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h"); - setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f"); + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, + { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, + { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + } } if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else addRegisterClass(MVT::i32, &ARM::GPRRegClass); + if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { addRegisterClass(MVT::f32, &ARM::SPRRegClass); @@ -976,6 +988,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); + // Register based DivRem for AEABI (RTABI 4.2) if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || @@ -984,29 +997,49 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UREM, MVT::i64, Custom); HasStandaloneRem = false; - for (const auto &LC : - {RTLIB::SDIVREM_I8, RTLIB::SDIVREM_I16, RTLIB::SDIVREM_I32}) - setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_sdiv" - : "__aeabi_idivmod"); - setLibcallName(RTLIB::SDIVREM_I64, Subtarget->isTargetWindows() - ? "__rt_sdiv64" - : "__aeabi_ldivmod"); - for (const auto &LC : - {RTLIB::UDIVREM_I8, RTLIB::UDIVREM_I16, RTLIB::UDIVREM_I32}) - setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_udiv" - : "__aeabi_uidivmod"); - setLibcallName(RTLIB::UDIVREM_I64, Subtarget->isTargetWindows() - ? 
"__rt_udiv64" - : "__aeabi_uldivmod"); - - setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS); + if (Subtarget->isTargetWindows()) { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS }, + + { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + } + } else { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS }, + + { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + } + } setOperationAction(ISD::SDIVREM, MVT::i32, Custom); setOperationAction(ISD::UDIVREM, MVT::i32, Custom); @@ -3305,11 +3338,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. - case Intrinsic::arm_rbit: { - assert(Op.getOperand(1).getValueType() == MVT::i32 && - "RBIT intrinsic must have i32 type!"); - return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1)); - } case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); @@ -9232,12 +9260,102 @@ SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, return SDValue(); } -// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction -// (only after legalization). -static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, +static bool IsVUZPShuffleNode(SDNode *N) { + // VUZP shuffle node. + if (N->getOpcode() == ARMISD::VUZP) + return true; + + // "VUZP" on i32 is an alias for VTRN. 
+ if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) + return true; + + return false; +} + +static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { + // Look for ADD(VUZP.0, VUZP.1). + if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || + N0 == N1) + return SDValue(); + + // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. + if (!N->getValueType(0).is64BitVector()) + return SDValue(); + // Generate vpadd. + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc dl(N); + SDNode *Unzip = N0.getNode(); + EVT VT = N->getValueType(0); + + SmallVector<SDValue, 8> Ops; + Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + Ops.push_back(Unzip->getOperand(0)); + Ops.push_back(Unzip->getOperand(1)); + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); +} + +static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // Check for two extended operands. + if (!(N0.getOpcode() == ISD::SIGN_EXTEND && + N1.getOpcode() == ISD::SIGN_EXTEND) && + !(N0.getOpcode() == ISD::ZERO_EXTEND && + N1.getOpcode() == ISD::ZERO_EXTEND)) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + + // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) + if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || + N00 == N10) + return SDValue(); + + // We only recognize Q register paddl here; this can't be reached until + // after type legalization. + if (!N00.getValueType().is64BitVector() || + !N0.getValueType().is128BitVector()) + return SDValue(); + + // Generate vpaddl. + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc dl(N); + EVT VT = N->getValueType(0); + + SmallVector<SDValue, 8> Ops; + // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. + unsigned Opcode; + if (N0.getOpcode() == ISD::SIGN_EXTEND) + Opcode = Intrinsic::arm_neon_vpaddls; + else + Opcode = Intrinsic::arm_neon_vpaddlu; + Ops.push_back(DAG.getConstant(Opcode, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + EVT ElemTy = N00.getValueType().getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT, + N00.getOperand(0), N00.getOperand(1)); + Ops.push_back(Concat); + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); +} + +// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in +// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is +// much easier to match. +static SDValue +AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { // Only perform optimization if after legalize, and if NEON is available. We // also expected both operands to be BUILD_VECTORs. if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() @@ -9293,6 +9411,10 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } + // Don't generate vpaddl+vmovn; we'll match it to vpadd later. + if (Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) + return SDValue(); + // Create VPADDL node. 
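// Review note on the combine family above: on a two-element vector,
// unzip and transpose are the same permutation, so the DAG has no
// v2i32 VUZP and IsVUZPShuffleNode also accepts VTRN in that case.
// AddCombineToVPADD turns ADD(VUZP.0, VUZP.1) into a single vpadd;
// the sign/zero-extended variant becomes vpaddl.sN or vpaddl.uN; and
// the early-out above ("don't generate vpaddl+vmovn") leaves the
// same-width case to the vpadd path, since vpaddl widens its element
// type and would otherwise need a narrowing vmovn afterwards.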
SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -9564,9 +9686,15 @@ static SDValue PerformADDCCombine(SDNode *N, static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget){ + // Attempt to create vpadd for this add. + if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) + return Result; // Attempt to create vpaddl for this add. - if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget)) + if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) + return Result; + if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, + Subtarget)) return Result; // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 5255d82d647a..7a7f91f4d3c4 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -16,16 +16,28 @@ #define LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H #include "MCTargetDesc/ARMBaseInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetLowering.h" -#include <vector> +#include <utility> namespace llvm { - class ARMConstantPoolValue; - class ARMSubtarget; + +class ARMSubtarget; +class InstrItineraryData; namespace ARMISD { + // ARM Specific DAG Nodes enum NodeType : unsigned { // Start the numbering where the builtin ops and target ops leave off. @@ -217,12 +229,15 @@ namespace llvm { VST3LN_UPD, VST4LN_UPD }; - } + + } // end namespace ARMISD /// Define some predicates that are used for node matching. namespace ARM { + bool isBitFieldInvertedMask(unsigned v); - } + + } // end namespace ARM //===--------------------------------------------------------------------===// // ARMTargetLowering - ARM Implementation of the TargetLowering interface @@ -531,6 +546,7 @@ namespace llvm { std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const; typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector; + void PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, @@ -623,6 +639,7 @@ namespace llvm { return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; void insertCopiesSplitCSR( MachineBasicBlock *Entry, @@ -644,9 +661,8 @@ namespace llvm { unsigned ArgOffset, unsigned TotalArgRegsSaveSize, bool ForceMutable = false) const; - SDValue - LowerCall(TargetLowering::CallLoweringInfo &CLI, - SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; /// HandleByVal - Target-specific cleanup for ByVal support. 
void HandleByVal(CCState *, unsigned &, unsigned) const override; @@ -712,9 +728,12 @@ namespace llvm { }; namespace ARM { + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); - } -} -#endif // ARMISELLOWERING_H + } // end namespace ARM + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index 9bd036a1eace..324087d670b5 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -29,7 +29,33 @@ using namespace llvm; // into an ARMGenRegisterBankInfo.def (similar to AArch64). namespace llvm { namespace ARM { -RegisterBank GPRRegBank; +const uint32_t GPRCoverageData[] = { + // Classes 0-31 + (1u << ARM::GPRRegClassID) | (1u << ARM::GPRwithAPSRRegClassID) | + (1u << ARM::GPRnopcRegClassID) | (1u << ARM::rGPRRegClassID) | + (1u << ARM::hGPRRegClassID) | (1u << ARM::tGPRRegClassID) | + (1u << ARM::GPRnopc_and_hGPRRegClassID) | + (1u << ARM::hGPR_and_rGPRRegClassID) | (1u << ARM::tcGPRRegClassID) | + (1u << ARM::tGPR_and_tcGPRRegClassID) | (1u << ARM::GPRspRegClassID) | + (1u << ARM::hGPR_and_tcGPRRegClassID), + // Classes 32-63 + 0, + // Classes 64-96 + 0, + // FIXME: Some of the entries below this point can be safely removed once + // this is tablegenerated. It's only needed because of the hardcoded + // register class limit. + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +RegisterBank GPRRegBank(ARM::GPRRegBankID, "GPRB", 32, ARM::GPRCoverageData); RegisterBank *RegBanks[] = {&GPRRegBank}; RegisterBankInfo::PartialMapping GPRPartialMapping{0, 32, GPRRegBank}; @@ -51,14 +77,11 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI) return; AlreadyInit = true; - // Initialize the GPR bank. - createRegisterBank(ARM::GPRRegBankID, "GPRB"); - - addRegBankCoverage(ARM::GPRRegBankID, ARM::GPRRegClassID, TRI); - addRegBankCoverage(ARM::GPRRegBankID, ARM::GPRwithAPSRRegClassID, TRI); const RegisterBank &RBGPR = getRegBank(ARM::GPRRegBankID); (void)RBGPR; assert(&ARM::GPRRegBank == &RBGPR && "The order in RegBanks is messed up"); + + // Initialize the GPR bank. 
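// Review note: the GPR bank is now constructed statically from the
// GPRCoverageData bit table above (one bit per register class ID,
// 32 class IDs per word), replacing the old runtime
// createRegisterBank()/addRegBankCoverage() calls; what remains in
// the constructor only asserts that the hardcoded coverage stays in
// sync with the register classes TRI reports.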
assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRRegClassID)) && "Subclass not added?"); assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRwithAPSRRegClassID)) && diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index cc001b596785..2b6b36bc3e68 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -433,7 +433,8 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, int ARMTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index 731a5adf3d73..3c83cd92a61a 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -114,7 +114,8 @@ public: TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); diff --git a/lib/Target/Lanai/LanaiTargetTransformInfo.h b/lib/Target/Lanai/LanaiTargetTransformInfo.h index 7fcb3ce45bbb..d95c16fc3caf 100644 --- a/lib/Target/Lanai/LanaiTargetTransformInfo.h +++ b/lib/Target/Lanai/LanaiTargetTransformInfo.h @@ -54,7 +54,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) { + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()) { int ISD = TLI->InstructionOpcodeToISD(Opcode); switch (ISD) { diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 26e0f9a94368..f28e8b36fdbc 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -14,11 +14,13 @@ #include "MipsMachineFunction.h" #include "MipsRegisterInfo.h" #include "MipsTargetMachine.h" +#include "llvm/ADT/APInt.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -1456,9 +1458,12 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) { return Result; } -static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG) { - return DAG.getConstant(Op->getConstantOperandVal(ImmOp), SDLoc(Op), - Op->getValueType(0)); +static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG, + bool IsSigned = false) { + return DAG.getConstant( + APInt(Op->getValueType(0).getScalarType().getSizeInBits(), + Op->getConstantOperandVal(ImmOp), IsSigned), + SDLoc(Op), Op->getValueType(0)); } static SDValue getBuildVectorSplat(EVT VecTy, 
SDValue SplatValue, @@ -1564,8 +1569,8 @@ static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) { SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - - switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) { + unsigned Intrinsic = cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue(); + switch (Intrinsic) { default: return SDValue(); case Intrinsic::mips_shilo: @@ -1635,6 +1640,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, // binsli_x(IfClear, IfSet, nbits) -> (vselect LBitsMask, IfSet, IfClear) EVT VecTy = Op->getValueType(0); EVT EltTy = VecTy.getVectorElementType(); + if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits()) + report_fatal_error("Immediate out of range"); APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(), Op->getConstantOperandVal(3)); return DAG.getNode(ISD::VSELECT, DL, VecTy, @@ -1648,6 +1655,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, // binsri_x(IfClear, IfSet, nbits) -> (vselect RBitsMask, IfSet, IfClear) EVT VecTy = Op->getValueType(0); EVT EltTy = VecTy.getVectorElementType(); + if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits()) + report_fatal_error("Immediate out of range"); APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(), Op->getConstantOperandVal(3)); return DAG.getNode(ISD::VSELECT, DL, VecTy, @@ -1741,7 +1750,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_ceqi_w: case Intrinsic::mips_ceqi_d: return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1), - lowerMSASplatImm(Op, 2, DAG), ISD::SETEQ); + lowerMSASplatImm(Op, 2, DAG, true), ISD::SETEQ); case Intrinsic::mips_cle_s_b: case Intrinsic::mips_cle_s_h: case Intrinsic::mips_cle_s_w: @@ -1753,7 +1762,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_clei_s_w: case Intrinsic::mips_clei_s_d: return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1), - lowerMSASplatImm(Op, 2, DAG), ISD::SETLE); + lowerMSASplatImm(Op, 2, DAG, true), ISD::SETLE); case Intrinsic::mips_cle_u_b: case Intrinsic::mips_cle_u_h: case Intrinsic::mips_cle_u_w: @@ -1777,7 +1786,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_clti_s_w: case Intrinsic::mips_clti_s_d: return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1), - lowerMSASplatImm(Op, 2, DAG), ISD::SETLT); + lowerMSASplatImm(Op, 2, DAG, true), ISD::SETLT); case Intrinsic::mips_clt_u_b: case Intrinsic::mips_clt_u_h: case Intrinsic::mips_clt_u_w: @@ -1990,15 +1999,28 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_insve_b: case Intrinsic::mips_insve_h: case Intrinsic::mips_insve_w: - case Intrinsic::mips_insve_d: + case Intrinsic::mips_insve_d: { + // Report an error for out of range values. 
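// Review note: this validate-then-lower shape recurs for the MSA
// immediate intrinsics below (sat_s/sat_u, shf, sldi, srari, srlri):
// the legal range of the immediate depends on the element width
// (b/h/w/d), and an out-of-range value aborts compilation via
// report_fatal_error rather than being silently truncated into a
// miscompile.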
+ int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_insve_b: Max = 15; break; + case Intrinsic::mips_insve_h: Max = 7; break; + case Intrinsic::mips_insve_w: Max = 3; break; + case Intrinsic::mips_insve_d: Max = 1; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); return DAG.getNode(MipsISD::INSVE, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2), Op->getOperand(3), DAG.getConstant(0, DL, MVT::i32)); + } case Intrinsic::mips_ldi_b: case Intrinsic::mips_ldi_h: case Intrinsic::mips_ldi_w: case Intrinsic::mips_ldi_d: - return lowerMSASplatImm(Op, 1, DAG); + return lowerMSASplatImm(Op, 1, DAG, true); case Intrinsic::mips_lsa: case Intrinsic::mips_dlsa: { EVT ResTy = Op->getValueType(0); @@ -2032,7 +2054,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_maxi_s_w: case Intrinsic::mips_maxi_s_d: return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0), - Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG)); + Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG, true)); case Intrinsic::mips_maxi_u_b: case Intrinsic::mips_maxi_u_h: case Intrinsic::mips_maxi_u_w: @@ -2056,7 +2078,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_mini_s_w: case Intrinsic::mips_mini_s_d: return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0), - Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG)); + Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG, true)); case Intrinsic::mips_mini_u_b: case Intrinsic::mips_mini_u_h: case Intrinsic::mips_mini_u_w: @@ -2129,11 +2151,59 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_pcnt_w: case Intrinsic::mips_pcnt_d: return DAG.getNode(ISD::CTPOP, DL, Op->getValueType(0), Op->getOperand(1)); + case Intrinsic::mips_sat_s_b: + case Intrinsic::mips_sat_s_h: + case Intrinsic::mips_sat_s_w: + case Intrinsic::mips_sat_s_d: + case Intrinsic::mips_sat_u_b: + case Intrinsic::mips_sat_u_h: + case Intrinsic::mips_sat_u_w: + case Intrinsic::mips_sat_u_d: { + // Report an error for out of range values. + int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_sat_s_b: + case Intrinsic::mips_sat_u_b: Max = 7; break; + case Intrinsic::mips_sat_s_h: + case Intrinsic::mips_sat_u_h: Max = 15; break; + case Intrinsic::mips_sat_s_w: + case Intrinsic::mips_sat_u_w: Max = 31; break; + case Intrinsic::mips_sat_s_d: + case Intrinsic::mips_sat_u_d: Max = 63; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); + return SDValue(); + } case Intrinsic::mips_shf_b: case Intrinsic::mips_shf_h: - case Intrinsic::mips_shf_w: + case Intrinsic::mips_shf_w: { + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > 255) + report_fatal_error("Immediate out of range"); return DAG.getNode(MipsISD::SHF, DL, Op->getValueType(0), Op->getOperand(2), Op->getOperand(1)); + } + case Intrinsic::mips_sldi_b: + case Intrinsic::mips_sldi_h: + case Intrinsic::mips_sldi_w: + case Intrinsic::mips_sldi_d: { + // Report an error for out of range values. 
+ int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_sldi_b: Max = 15; break; + case Intrinsic::mips_sldi_h: Max = 7; break; + case Intrinsic::mips_sldi_w: Max = 3; break; + case Intrinsic::mips_sldi_d: Max = 1; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(3))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); + return SDValue(); + } case Intrinsic::mips_sll_b: case Intrinsic::mips_sll_h: case Intrinsic::mips_sll_w: @@ -2176,6 +2246,24 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_srai_d: return DAG.getNode(ISD::SRA, DL, Op->getValueType(0), Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG)); + case Intrinsic::mips_srari_b: + case Intrinsic::mips_srari_h: + case Intrinsic::mips_srari_w: + case Intrinsic::mips_srari_d: { + // Report an error for out of range values. + int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_srari_b: Max = 7; break; + case Intrinsic::mips_srari_h: Max = 15; break; + case Intrinsic::mips_srari_w: Max = 31; break; + case Intrinsic::mips_srari_d: Max = 63; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); + return SDValue(); + } case Intrinsic::mips_srl_b: case Intrinsic::mips_srl_h: case Intrinsic::mips_srl_w: @@ -2188,6 +2276,24 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_srli_d: return DAG.getNode(ISD::SRL, DL, Op->getValueType(0), Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG)); + case Intrinsic::mips_srlri_b: + case Intrinsic::mips_srlri_h: + case Intrinsic::mips_srlri_w: + case Intrinsic::mips_srlri_d: { + // Report an error for out of range values. + int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_srlri_b: Max = 7; break; + case Intrinsic::mips_srlri_h: Max = 15; break; + case Intrinsic::mips_srlri_w: Max = 31; break; + case Intrinsic::mips_srlri_d: Max = 63; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); + return SDValue(); + } case Intrinsic::mips_subv_b: case Intrinsic::mips_subv_h: case Intrinsic::mips_subv_w: @@ -2219,7 +2325,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, } } -static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) { +static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr, + const MipsSubtarget &Subtarget) { SDLoc DL(Op); SDValue ChainIn = Op->getOperand(0); SDValue Address = Op->getOperand(2); @@ -2227,6 +2334,12 @@ static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) { EVT ResTy = Op->getValueType(0); EVT PtrTy = Address->getValueType(0); + // For N64 addresses have the underlying type MVT::i64. This intrinsic + // however takes an i32 signed constant offset. The actual type of the + // intrinsic is a scaled signed i10. 
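The hunks above all instantiate one validation pattern: each MSA intrinsic with an immediate operand gets a per-intrinsic bound (lane count minus one for element-indexed intrinsics such as insve and sldi, element width in bits minus one for bit-count intrinsics such as sat, srari and srlri, and 255 for shf's packed 2-bit selectors), and an immediate outside [0, Max] now aborts compilation instead of silently miscompiling. For example, mips_sat_s_b saturates 8-bit lanes, so Max is 7, while mips_insve_b indexes one of sixteen byte lanes, so Max is 15. A minimal sketch of the shared shape, using a hypothetical helper name that does not appear in the patch:

static void checkMSAImmediate(SDValue Op, unsigned OpIdx, int64_t Max) {
  // The immediate is a ConstantSDNode operand of the intrinsic node.
  int64_t Value = cast<ConstantSDNode>(Op->getOperand(OpIdx))->getSExtValue();
  if (Value < 0 || Value > Max)
    report_fatal_error("Immediate out of range");
}

The N64 offset sign-extension that the comment immediately above describes follows in the next lines of this hunk.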
+ if (Subtarget.isABI_N64()) + Offset = DAG.getNode(ISD::SIGN_EXTEND, DL, PtrTy, Offset); + Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset); return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(), /* Alignment = */ 16); @@ -2282,11 +2395,12 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::mips_ld_h: case Intrinsic::mips_ld_w: case Intrinsic::mips_ld_d: - return lowerMSALoadIntr(Op, DAG, Intr); + return lowerMSALoadIntr(Op, DAG, Intr, Subtarget); } } -static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) { +static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr, + const MipsSubtarget &Subtarget) { SDLoc DL(Op); SDValue ChainIn = Op->getOperand(0); SDValue Value = Op->getOperand(2); @@ -2294,6 +2408,12 @@ static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) { SDValue Offset = Op->getOperand(4); EVT PtrTy = Address->getValueType(0); + // For N64 addresses have the underlying type MVT::i64. This intrinsic + // however takes an i32 signed constant offset. The actual type of the + // intrinsic is a scaled signed i10. + if (Subtarget.isABI_N64()) + Offset = DAG.getNode(ISD::SIGN_EXTEND, DL, PtrTy, Offset); + Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset); return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(), @@ -2310,7 +2430,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op, case Intrinsic::mips_st_h: case Intrinsic::mips_st_w: case Intrinsic::mips_st_d: - return lowerMSAStoreIntr(Op, DAG, Intr); + return lowerMSAStoreIntr(Op, DAG, Intr, Subtarget); } } @@ -3377,8 +3497,12 @@ MipsSETargetLowering::emitFILL_FW(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); unsigned Wd = MI.getOperand(0).getReg(); unsigned Fs = MI.getOperand(1).getReg(); - unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); - unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + unsigned Wt1 = RegInfo.createVirtualRegister( + Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass + : &Mips::MSA128WEvensRegClass); + unsigned Wt2 = RegInfo.createVirtualRegister( + Subtarget.useOddSPReg() ? 
&Mips::MSA128WRegClass + : &Mips::MSA128WEvensRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1); BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2) diff --git a/lib/Target/NVPTX/ManagedStringPool.h b/lib/Target/NVPTX/ManagedStringPool.h index a2d670f8d39d..7fc0156216f5 100644 --- a/lib/Target/NVPTX/ManagedStringPool.h +++ b/lib/Target/NVPTX/ManagedStringPool.h @@ -27,7 +27,8 @@ class ManagedStringPool { SmallVector<std::string *, 8> Pool; public: - ManagedStringPool() {} + ManagedStringPool() = default; + ~ManagedStringPool() { SmallVectorImpl<std::string *>::iterator Current = Pool.begin(); while (Current != Pool.end()) { @@ -43,6 +44,6 @@ public: } }; -} +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_NVPTX_MANAGEDSTRINGPOOL_H diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 04c8d5c0443e..3c2594c77f45 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -12,42 +12,83 @@ // //===----------------------------------------------------------------------===// -#include "NVPTXAsmPrinter.h" #include "InstPrinter/NVPTXInstPrinter.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "MCTargetDesc/NVPTXMCAsmInfo.h" #include "NVPTX.h" -#include "NVPTXInstrInfo.h" +#include "NVPTXAsmPrinter.h" #include "NVPTXMCExpr.h" #include "NVPTXMachineFunctionInfo.h" #include "NVPTXRegisterInfo.h" +#include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" #include "NVPTXUtilities.h" #include "cl_common_defines.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Mangler.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" +#include 
"llvm/Target/TargetRegisterInfo.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include <cassert> +#include <cstdint> +#include <cstring> +#include <new> #include <sstream> +#include <string> +#include <utility> +#include <vector> + using namespace llvm; #define DEPOTNAME "__local_depot" @@ -62,11 +103,11 @@ InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden, cl::desc("NVPTX Specific: Emit source line in ptx file"), cl::init(false)); -namespace { /// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V /// depends. -void DiscoverDependentGlobals(const Value *V, - DenseSet<const GlobalVariable *> &Globals) { +static void +DiscoverDependentGlobals(const Value *V, + DenseSet<const GlobalVariable *> &Globals) { if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) Globals.insert(GV); else { @@ -80,11 +121,12 @@ void DiscoverDependentGlobals(const Value *V, /// VisitGlobalVariableForEmission - Add \p GV to the list of GlobalVariable /// instances to be emitted, but only after any dependents have been added -/// first. -void VisitGlobalVariableForEmission( - const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order, - DenseSet<const GlobalVariable *> &Visited, - DenseSet<const GlobalVariable *> &Visiting) { +/// first.s +static void +VisitGlobalVariableForEmission(const GlobalVariable *GV, + SmallVectorImpl<const GlobalVariable *> &Order, + DenseSet<const GlobalVariable *> &Visited, + DenseSet<const GlobalVariable *> &Visiting) { // Have we already visited this one? if (Visited.count(GV)) return; @@ -108,7 +150,6 @@ void VisitGlobalVariableForEmission( Visited.insert(GV); Visiting.erase(GV); } -} void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) { if (!EmitLineNumbers) @@ -369,7 +410,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { } else if (Ty->isAggregateType() || Ty->isVectorTy()) { unsigned totalsz = DL.getTypeAllocSize(Ty); unsigned retAlignment = 0; - if (!llvm::getAlign(*F, 0, retAlignment)) + if (!getAlign(*F, 0, retAlignment)) retAlignment = DL.getABITypeAlignment(Ty); O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz << "]"; @@ -401,7 +442,6 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { } } O << ") "; - return; } void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF, @@ -459,7 +499,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { MRI = &MF->getRegInfo(); F = MF->getFunction(); emitLinkageDirective(F, O); - if (llvm::isKernelFunction(*F)) + if (isKernelFunction(*F)) O << ".entry "; else { O << ".func "; @@ -470,7 +510,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { emitFunctionParamList(*MF, O); - if (llvm::isKernelFunction(*F)) + if (isKernelFunction(*F)) emitKernelFunctionDirectives(*F, O); OutStreamer->EmitRawText(O.str()); @@ -513,15 +553,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // If none of reqntid* is specified, don't output reqntid directive. 
unsigned reqntidx, reqntidy, reqntidz; bool specified = false; - if (!llvm::getReqNTIDx(F, reqntidx)) + if (!getReqNTIDx(F, reqntidx)) reqntidx = 1; else specified = true; - if (!llvm::getReqNTIDy(F, reqntidy)) + if (!getReqNTIDy(F, reqntidy)) reqntidy = 1; else specified = true; - if (!llvm::getReqNTIDz(F, reqntidz)) + if (!getReqNTIDz(F, reqntidz)) reqntidz = 1; else specified = true; @@ -535,15 +575,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // If none of maxntid* is specified, don't output maxntid directive. unsigned maxntidx, maxntidy, maxntidz; specified = false; - if (!llvm::getMaxNTIDx(F, maxntidx)) + if (!getMaxNTIDx(F, maxntidx)) maxntidx = 1; else specified = true; - if (!llvm::getMaxNTIDy(F, maxntidy)) + if (!getMaxNTIDy(F, maxntidy)) maxntidy = 1; else specified = true; - if (!llvm::getMaxNTIDz(F, maxntidz)) + if (!getMaxNTIDz(F, maxntidz)) maxntidz = 1; else specified = true; @@ -553,11 +593,11 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, << "\n"; unsigned mincta; - if (llvm::getMinCTASm(F, mincta)) + if (getMinCTASm(F, mincta)) O << ".minnctapersm " << mincta << "\n"; unsigned maxnreg; - if (llvm::getMaxNReg(F, maxnreg)) + if (getMaxNReg(F, maxnreg)) O << ".maxnreg " << maxnreg << "\n"; } @@ -617,12 +657,9 @@ void NVPTXAsmPrinter::printVecModifiedImmediate( llvm_unreachable("Unknown Modifier on immediate operand"); } - - void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) { - emitLinkageDirective(F, O); - if (llvm::isKernelFunction(*F)) + if (isKernelFunction(*F)) O << ".entry "; else O << ".func "; @@ -684,7 +721,7 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { if (!gv->hasInternalLinkage()) return false; PointerType *Pty = gv->getType(); - if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED) + if (Pty->getAddressSpace() != ADDRESS_SPACE_SHARED) return false; const Function *oneFunc = nullptr; @@ -699,7 +736,7 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { } static bool useFuncSeen(const Constant *C, - llvm::DenseMap<const Function *, bool> &seenMap) { + DenseMap<const Function *, bool> &seenMap) { for (const User *U : C->users()) { if (const Constant *cu = dyn_cast<Constant>(U)) { if (useFuncSeen(cu, seenMap)) @@ -719,7 +756,7 @@ static bool useFuncSeen(const Constant *C, } void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { - llvm::DenseMap<const Function *, bool> seenMap; + DenseMap<const Function *, bool> seenMap; for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { const Function *F = &*FI; @@ -1040,7 +1077,6 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, bool processDemoted) { - // Skip meta data if (GVar->hasSection()) { if (GVar->getSection() == "llvm.metadata") @@ -1069,13 +1105,13 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << ".weak "; } - if (llvm::isTexture(*GVar)) { - O << ".global .texref " << llvm::getTextureName(*GVar) << ";\n"; + if (isTexture(*GVar)) { + O << ".global .texref " << getTextureName(*GVar) << ";\n"; return; } - if (llvm::isSurface(*GVar)) { - O << ".global .surfref " << llvm::getSurfaceName(*GVar) << ";\n"; + if (isSurface(*GVar)) { + O << ".global .surfref " << getSurfaceName(*GVar) << ";\n"; return; } @@ -1088,8 +1124,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, return; } - if 
(llvm::isSampler(*GVar)) { - O << ".global .samplerref " << llvm::getSamplerName(*GVar); + if (isSampler(*GVar)) { + O << ".global .samplerref " << getSamplerName(*GVar); const Constant *Initializer = nullptr; if (GVar->hasInitializer()) @@ -1150,12 +1186,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, } if (GVar->hasPrivateLinkage()) { - - if (!strncmp(GVar->getName().data(), "unrollpragma", 12)) + if (strncmp(GVar->getName().data(), "unrollpragma", 12) == 0) return; // FIXME - need better way (e.g. Metadata) to avoid generating this global - if (!strncmp(GVar->getName().data(), "filename", 8)) + if (strncmp(GVar->getName().data(), "filename", 8) == 0) return; if (GVar->use_empty()) return; @@ -1199,8 +1234,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // Ptx allows variable initilization only for constant and global state // spaces. if (GVar->hasInitializer()) { - if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) { + if ((PTy->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || + (PTy->getAddressSpace() == ADDRESS_SPACE_CONST)) { const Constant *Initializer = GVar->getInitializer(); // 'undef' is treated as there is no value specified. if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) { @@ -1233,8 +1268,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, ElementSize = DL.getTypeStoreSize(ETy); // Ptx allows variable initilization only for constant and // global state spaces. - if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) && + if (((PTy->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || + (PTy->getAddressSpace() == ADDRESS_SPACE_CONST)) && GVar->hasInitializer()) { const Constant *Initializer = GVar->getInitializer(); if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) { @@ -1285,7 +1320,6 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, default: llvm_unreachable("type not supported yet"); } - } O << ";\n"; } @@ -1305,16 +1339,16 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) { void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const { switch (AddressSpace) { - case llvm::ADDRESS_SPACE_LOCAL: + case ADDRESS_SPACE_LOCAL: O << "local"; break; - case llvm::ADDRESS_SPACE_GLOBAL: + case ADDRESS_SPACE_GLOBAL: O << "global"; break; - case llvm::ADDRESS_SPACE_CONST: + case ADDRESS_SPACE_CONST: O << "const"; break; - case llvm::ADDRESS_SPACE_SHARED: + case ADDRESS_SPACE_SHARED: O << "shared"; break; default: @@ -1363,7 +1397,6 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const { void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O) { - const DataLayout &DL = getDataLayout(); // GlobalVariables are always constant pointers themselves. 
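The printModuleLevelGV and emitPTXAddressSpace routines above map a global's numeric address space to a PTX state-space keyword and, as elsewhere in this printer, emit aggregates as byte arrays. A rough illustration of the correspondence; the IR line is hypothetical and the emitted PTX is an approximation, not output quoted from this patch:

// Address space 3 is ADDRESS_SPACE_SHARED on NVPTX:
//   @buf = internal addrspace(3) global [256 x float] zeroinitializer, align 4
// printModuleLevelGV would emit, approximately:
//   .shared .align 4 .b8 buf[1024];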
@@ -1406,7 +1439,6 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, default: llvm_unreachable("type not supported yet"); } - return; } static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) { @@ -1450,7 +1482,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { Function::const_arg_iterator I, E; unsigned paramIndex = 0; bool first = true; - bool isKernelFunc = llvm::isKernelFunction(*F); + bool isKernelFunc = isKernelFunction(*F); bool isABI = (nvptxSubtarget->getSmVersion() >= 20); MVT thePointerTy = TLI->getPointerTy(DL); @@ -1533,13 +1565,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { default: O << ".ptr "; break; - case llvm::ADDRESS_SPACE_CONST: + case ADDRESS_SPACE_CONST: O << ".ptr .const "; break; - case llvm::ADDRESS_SPACE_SHARED: + case ADDRESS_SPACE_SHARED: O << ".ptr .shared "; break; - case llvm::ADDRESS_SPACE_GLOBAL: + case ADDRESS_SPACE_GLOBAL: O << ".ptr .global "; break; } @@ -1820,7 +1852,6 @@ static void ConvertDoubleToBytes(unsigned char *p, double val) { void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer) { - const DataLayout &DL = getDataLayout(); if (isa<UndefValue>(CPV) || CPV->isNullValue()) { @@ -1985,7 +2016,6 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, // buildTypeNameMap - Run through symbol table looking for type names. // - bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) { switch (MI.getOpcode()) { default: @@ -2100,7 +2130,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) raw_string_ostream OS(S); OS << "Unsupported expression in static initializer: "; CE->printAsOperand(OS, /*PrintType=*/ false, - !MF ? 0 : MF->getFunction()->getParent()); + !MF ? 
nullptr : MF->getFunction()->getParent()); report_fatal_error(OS.str()); } @@ -2330,7 +2360,7 @@ void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier) { printOperand(MI, opNum, O); - if (Modifier && !strcmp(Modifier, "add")) { + if (Modifier && strcmp(Modifier, "add") == 0) { O << ", "; printOperand(MI, opNum + 1, O); } else { diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 3dcc0e358a14..8ec3476b8719 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -1,4 +1,4 @@ -//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===// +//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer ----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,17 +18,34 @@ #include "NVPTX.h" #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Value.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/FormattedStream.h" +#include "llvm/PassAnalysisSupport.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include <algorithm> +#include <cassert> #include <fstream> +#include <map> +#include <memory> +#include <string> +#include <vector> // The ptx syntax and format is very different from that usually seem in a .s // file, @@ -40,7 +57,8 @@ // (subclass of MCStreamer). namespace llvm { - class MCOperand; + +class MCOperand; class LineReader { private: @@ -49,14 +67,17 @@ private: char buff[512]; std::string theFileName; SmallVector<unsigned, 32> lineOffset; + public: LineReader(std::string filename) { theCurLine = 0; fstr.open(filename.c_str()); theFileName = filename; } - std::string fileName() { return theFileName; } + ~LineReader() { fstr.close(); } + + std::string fileName() { return theFileName; } std::string readLine(unsigned line); }; @@ -107,6 +128,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { numSymbols = 0; EmitGeneric = AP.EmitGeneric; } + unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) { assert((curpos + Num) <= size); assert((curpos + Bytes) <= size); @@ -120,6 +142,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { } return curpos; } + unsigned addZeros(int Num) { assert((curpos + Num) <= size); for (int i = 0; i < Num; ++i) { @@ -128,12 +151,14 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { } return curpos; } + void addSymbol(const Value *GVar, const Value *GVarBeforeStripping) { symbolPosInBuffer.push_back(curpos); Symbols.push_back(GVar); SymbolsBeforeStripping.push_back(GVarBeforeStripping); numSymbols++; } + void print() { if (numSymbols == 0) { // print out in bytes @@ -267,7 +292,7 @@ private: std::map<Type *, std::string> TypeNameMap; // List of variables demoted to a function scope. 
- std::map<const Function *, std::vector<const GlobalVariable *> > localDecls; + std::map<const Function *, std::vector<const GlobalVariable *>> localDecls; // To record filename to ID mapping std::map<std::string, unsigned> filenameMap; @@ -292,7 +317,8 @@ private: bool isLoopHeaderOfNoUnroll(const MachineBasicBlock &MBB) const; - LineReader *reader; + LineReader *reader = nullptr; + LineReader *getReader(const std::string &); // Used to control the need to emit .generic() in the initializer of @@ -312,20 +338,17 @@ public: NVPTXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), EmitGeneric(static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() == - NVPTX::CUDA) { - CurrentBankselLabelInBasicBlock = ""; - reader = nullptr; - } + NVPTX::CUDA) {} - ~NVPTXAsmPrinter() { - if (!reader) - delete reader; + ~NVPTXAsmPrinter() override { + delete reader; } bool runOnMachineFunction(MachineFunction &F) override { nvptxSubtarget = &F.getSubtarget<NVPTXSubtarget>(); return AsmPrinter::runOnMachineFunction(F); } + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineLoopInfo>(); AsmPrinter::getAnalysisUsage(AU); @@ -338,6 +361,7 @@ public: DebugLoc prevDebugLoc; void emitLineNumberAsDotLoc(const MachineInstr &); }; -} // end of namespace -#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_NVPTX_NVPTXASMPRINTER_H diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 2e4764feff11..7a760fd38d0f 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1,3 +1,4 @@ +//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===// // // The LLVM Compiler Infrastructure // @@ -11,31 +12,55 @@ // //===----------------------------------------------------------------------===// -#include "NVPTXISelLowering.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" +#include "NVPTXISelLowering.h" +#include "NVPTXSection.h" +#include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" #include "NVPTXTargetObjectFile.h" #include "NVPTXUtilities.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/Analysis.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#include "llvm/MC/MCSectionELF.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetCallingConv.h" 
+#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> #include <sstream> +#include <string> +#include <utility> +#include <vector> #undef DEBUG_TYPE #define DEBUG_TYPE "nvptx-lower" @@ -109,7 +134,6 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI) : TargetLowering(TM), nvTM(&TM), STI(STI) { - // always lower memset, memcpy, and memmove intrinsics to load/store // instructions, rather // then generating calls to memset, mempcy or memmove. @@ -981,7 +1005,7 @@ std::string NVPTXTargetLowering::getPrototype( unsigned align = 0; const CallInst *CallI = cast<CallInst>(CS->getInstruction()); // +1 because index 0 is reserved for return type alignment - if (!llvm::getAlign(*CallI, i + 1, align)) + if (!getAlign(*CallI, i + 1, align)) align = DL.getABITypeAlignment(Ty); unsigned sz = DL.getTypeAllocSize(Ty); O << ".param .align " << align << " .b8 "; @@ -1047,7 +1071,7 @@ unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // With bitcast'd call targets, the instruction will be the call if (isa<CallInst>(CalleeI)) { // Check if we have call alignment metadata - if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align)) + if (getAlign(*cast<CallInst>(CalleeI), Idx, Align)) return Align; const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue(); @@ -1070,7 +1094,7 @@ unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // Check for function alignment information if we found that the // ultimate target is a Function if (DirectCallee) - if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align)) + if (getAlign(*cast<Function>(DirectCallee), Idx, Align)) return Align; // Call is indirect or alignment information is not available, fall back to @@ -1747,7 +1771,6 @@ SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; if (VTBits == 32 && STI.getSmVersion() >= 35) { - // For 32bit and sm35, we can use the funnel shift 'shf' instruction. // {dHi, dLo} = {aHi, aLo} >> Amt // dHi = aHi >> Amt @@ -1761,7 +1784,6 @@ SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, return DAG.getMergeValues(Ops, dl); } else { - // {dHi, dLo} = {aHi, aLo} >> Amt // - if (Amt>=size) then // dLo = aHi >> (Amt-size) @@ -1809,7 +1831,6 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, SDValue ShAmt = Op.getOperand(2); if (VTBits == 32 && STI.getSmVersion() >= 35) { - // For 32bit and sm35, we can use the funnel shift 'shf' instruction. // {dHi, dLo} = {aHi, aLo} << Amt // dHi = shf.l.clamp aLo, aHi, Amt @@ -1823,7 +1844,6 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, return DAG.getMergeValues(Ops, dl); } else { - // {dHi, dLo} = {aHi, aLo} << Amt // - if (Amt>=size) then // dLo = aLo << Amt (all 0) @@ -2002,11 +2022,10 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { case 2: Opcode = NVPTXISD::StoreV2; break; - case 4: { + case 4: Opcode = NVPTXISD::StoreV4; break; } - } SmallVector<SDValue, 8> Ops; @@ -2140,7 +2159,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( theArgs[i], (theArgs[i]->getParent() ? 
theArgs[i]->getParent()->getParent() : nullptr))) { - assert(llvm::isKernelFunction(*F) && + assert(isKernelFunction(*F) && "Only kernels can have image/sampler params"); InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); continue; @@ -2193,7 +2212,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( 0); assert(vtparts.size() > 0 && "empty aggregate type not expected"); bool aggregateIsPacked = false; - if (StructType *STy = llvm::dyn_cast<StructType>(Ty)) + if (StructType *STy = dyn_cast<StructType>(Ty)) aggregateIsPacked = STy->isPacked(); SDValue Arg = getParamSymbol(DAG, idx, PtrVT); @@ -2202,7 +2221,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( EVT partVT = vtparts[parti]; Value *srcValue = Constant::getNullValue( PointerType::get(partVT.getTypeForEVT(F->getContext()), - llvm::ADDRESS_SPACE_PARAM)); + ADDRESS_SPACE_PARAM)); SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, DAG.getConstant(offsets[parti], dl, PtrVT)); @@ -2242,7 +2261,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (NumElts == 1) { // We only have one element, so just directly load it Value *SrcValue = Constant::getNullValue(PointerType::get( - EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); + EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); SDValue P = DAG.getLoad( EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())), @@ -2260,7 +2279,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // f32,f32 = load ... EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2); Value *SrcValue = Constant::getNullValue(PointerType::get( - VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); + VecVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); SDValue P = DAG.getLoad( VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())), @@ -2301,7 +2320,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( for (unsigned i = 0; i < NumElts; i += VecSize) { Value *SrcValue = Constant::getNullValue( PointerType::get(VecVT.getTypeForEVT(F->getContext()), - llvm::ADDRESS_SPACE_PARAM)); + ADDRESS_SPACE_PARAM)); SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, DAG.getConstant(Ofst, dl, PtrVT)); SDValue P = DAG.getLoad( @@ -2335,7 +2354,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // If ABI, load from the param symbol SDValue Arg = getParamSymbol(DAG, idx, PtrVT); Value *srcValue = Constant::getNullValue(PointerType::get( - ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); + ObjectVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); SDValue p; if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) { ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
@@ -2424,7 +2443,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, DAG.getVTList(MVT::Other), Ops, EltVT, MachinePointerInfo()); - } else if (NumElts == 2) { // V2 store SDValue StoreVal0 = OutVals[0]; @@ -2558,7 +2576,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); } - void NVPTXTargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { @@ -3306,7 +3323,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = true; Info.align = 0; @@ -3326,7 +3343,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); @@ -3347,7 +3364,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); @@ -3410,17 +3427,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: - case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: { + case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: Info.opc = getOpcForTextureInstr(Intrinsic); Info.memVT = MVT::v4f32; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_tex_1d_v4s32_s32: case Intrinsic::nvvm_tex_1d_v4s32_f32: case Intrinsic::nvvm_tex_1d_level_v4s32_f32: @@ -3532,17 +3549,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: - case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: { + case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: Info.opc = getOpcForTextureInstr(Intrinsic); Info.memVT = MVT::v4i32; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_suld_1d_i8_clamp: case Intrinsic::nvvm_suld_1d_v2i8_clamp: case Intrinsic::nvvm_suld_1d_v4i8_clamp: @@ -3587,17 +3604,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_v4i8_zero: case Intrinsic::nvvm_suld_3d_i8_zero: case Intrinsic::nvvm_suld_3d_v2i8_zero: - case Intrinsic::nvvm_suld_3d_v4i8_zero: { + case Intrinsic::nvvm_suld_3d_v4i8_zero: Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i8; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_suld_1d_i16_clamp: case Intrinsic::nvvm_suld_1d_v2i16_clamp: case Intrinsic::nvvm_suld_1d_v4i16_clamp: @@ -3642,17 +3659,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case 
Intrinsic::nvvm_suld_2d_array_v4i16_zero: case Intrinsic::nvvm_suld_3d_i16_zero: case Intrinsic::nvvm_suld_3d_v2i16_zero: - case Intrinsic::nvvm_suld_3d_v4i16_zero: { + case Intrinsic::nvvm_suld_3d_v4i16_zero: Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i16; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_suld_1d_i32_clamp: case Intrinsic::nvvm_suld_1d_v2i32_clamp: case Intrinsic::nvvm_suld_1d_v4i32_clamp: @@ -3697,17 +3714,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_v4i32_zero: case Intrinsic::nvvm_suld_3d_i32_zero: case Intrinsic::nvvm_suld_3d_v2i32_zero: - case Intrinsic::nvvm_suld_3d_v4i32_zero: { + case Intrinsic::nvvm_suld_3d_v4i32_zero: Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i32; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_suld_1d_i64_clamp: case Intrinsic::nvvm_suld_1d_v2i64_clamp: case Intrinsic::nvvm_suld_1d_array_i64_clamp: @@ -3737,18 +3754,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_i64_zero: case Intrinsic::nvvm_suld_2d_array_v2i64_zero: case Intrinsic::nvvm_suld_3d_i64_zero: - case Intrinsic::nvvm_suld_3d_v2i64_zero: { + case Intrinsic::nvvm_suld_3d_v2i64_zero: Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i64; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; } - } return false; } @@ -3760,7 +3776,6 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { - // AddrMode - This represents an addressing mode of: // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg // @@ -4059,7 +4074,7 @@ static SDValue PerformANDCombine(SDNode *N, } bool AddTo = false; - if (AExt.getNode() != 0) { + if (AExt.getNode() != nullptr) { // Re-insert the ext as a zext. 
Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), AExt.getValueType(), Val); @@ -4204,7 +4219,6 @@ static bool IsMulWideOperandDemotable(SDValue Op, static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned) { - OperandSignedness LHSSign; // The LHS operand must be a demotable op diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 92a88c7f2506..0fbb0448e4c4 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -144,7 +144,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; -def true : Predicate<"1">; +def true : Predicate<"true">; def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h index cad4f5668fdf..b0472de980fc 100644 --- a/lib/Target/NVPTX/NVPTXSection.h +++ b/lib/Target/NVPTX/NVPTXSection.h @@ -1,4 +1,4 @@ -//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===// +//===- NVPTXSection.h - NVPTX-specific section representation ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -14,18 +14,20 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H #define LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H -#include "llvm/IR/GlobalVariable.h" #include "llvm/MC/MCSection.h" +#include "llvm/MC/SectionKind.h" namespace llvm { + /// Represents a section in PTX. PTX does not have sections. We create this class /// in order to use the ASMPrint interface. /// class NVPTXSection final : public MCSection { virtual void anchor(); + public: NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {} - ~NVPTXSection() {} + ~NVPTXSection() = default; /// Override this as NVPTX has its own way of printing switching /// to a section.
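The tiny NVPTXInstrInfo.td fix above (Predicate<"1"> becoming Predicate<"true">) matters because a TableGen Predicate string is pasted verbatim into the generated instruction selector as a C++ boolean expression, so "true" is the idiomatic always-satisfied condition. Roughly, for a real predicate from the same file:

// TableGen source:
//   def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
// The generated matcher then tests, approximately:
if (Subtarget->getPTXVersion() >= 31) {
  // patterns guarded by hasPTX31 are available for selection
}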
@@ -40,4 +42,4 @@ public: } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 6c68a2c9370d..eb357e0a4d50 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -11,41 +11,28 @@ // //===----------------------------------------------------------------------===// -#include "NVPTXTargetMachine.h" -#include "MCTargetDesc/NVPTXMCAsmInfo.h" #include "NVPTX.h" #include "NVPTXAllocaHoisting.h" #include "NVPTXLowerAggrCopies.h" +#include "NVPTXTargetMachine.h" #include "NVPTXTargetObjectFile.h" #include "NVPTXTargetTransformInfo.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Verifier.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" +#include <cassert> +#include <string> using namespace llvm; @@ -57,6 +44,7 @@ static cl::opt<bool> cl::init(false), cl::Hidden); namespace llvm { + void initializeNVVMIntrRangePass(PassRegistry&); void initializeNVVMReflectPass(PassRegistry&); void initializeGenericToNVVMPass(PassRegistry&); @@ -66,7 +54,8 @@ void initializeNVPTXInferAddressSpacesPass(PassRegistry &); void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); void initializeNVPTXLowerArgsPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); -} + +} // end namespace llvm extern "C" void LLVMInitializeNVPTXTarget() { // Register the target. 
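The block of initialize* declarations above follows the standard LLVM pass-registration pattern: every machine-level pass exposes an initializeXPass(PassRegistry&) hook, and the target's LLVMInitialize entry point, whose opening line closes this hunk, invokes each one once. A sketch of the conventional shape of that function; the body is an assumption based on that convention, not quoted from the patch:

extern "C" void LLVMInitializeNVPTXTarget() {
  // Register both TargetMachine flavours with the target registry.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  // One call per pass declared in the namespace above.
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeGenericToNVVMPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  // ... and so on for the remaining NVPTX passes.
}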
@@ -109,7 +98,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options, Reloc::PIC_, CM, OL), is64bit(is64bit), - TLOF(make_unique<NVPTXTargetObjectFile>()), + TLOF(llvm::make_unique<NVPTXTargetObjectFile>()), Subtarget(TT, CPU, FS, *this) { if (TT.getOS() == Triple::NVCL) drvInterface = NVPTX::NVCL; @@ -118,7 +107,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, initAsmInfo(); } -NVPTXTargetMachine::~NVPTXTargetMachine() {} +NVPTXTargetMachine::~NVPTXTargetMachine() = default; void NVPTXTargetMachine32::anchor() {} @@ -141,6 +130,7 @@ NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT, : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} namespace { + class NVPTXPassConfig : public TargetPassConfig { public: NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM) @@ -170,6 +160,7 @@ private: // Add passes that perform straight-line scalar optimizations. void addStraightLineScalarOptimizationPasses(); }; + } // end anonymous namespace TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h index dc367a90594a..69c59d0296ab 100644 --- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -11,14 +11,13 @@ #define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H #include "NVPTXSection.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/SectionKind.h" #include "llvm/Target/TargetLoweringObjectFile.h" namespace llvm { -class GlobalVariable; -class Module; class NVPTXTargetObjectFile : public TargetLoweringObjectFile { - public: NVPTXTargetObjectFile() { TextSection = nullptr; @@ -43,7 +42,7 @@ public: DwarfMacinfoSection = nullptr; } - virtual ~NVPTXTargetObjectFile(); + ~NVPTXTargetObjectFile() override; void Initialize(MCContext &ctx, const TargetMachine &TM) override { TargetLoweringObjectFile::Initialize(ctx, TM); @@ -52,7 +51,6 @@ public: BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS()); ReadOnlySection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly()); - StaticCtorSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); StaticDtorSection = @@ -102,4 +100,4 @@ public: } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 48928ee2d540..dd7707084948 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -115,7 +115,7 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { int NVPTXTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { // Legalize the type. 
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index d953aa8a7199..b6c271ae4cbc 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -54,7 +54,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); }; diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index f7785342b364..f94d1eab097d 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -281,7 +281,7 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { int PPCTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); // Fallback to the default implementation. diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 8308086ccfaa..30ee2814aba1 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -71,7 +71,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 2081809def70..2d0a06af18ae 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -547,8 +547,26 @@ bool SystemZTargetLowering::isFoldableMemAccessOffset(Instruction *I, assert (isa<LoadInst>(I) || isa<StoreInst>(I)); Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() : I->getOperand(0)->getType()); - if (!isUInt<12>(Offset) && - (MemAccessTy->isFloatingPointTy() || MemAccessTy->isVectorTy())) + bool IsFPAccess = MemAccessTy->isFloatingPointTy(); + bool IsVectorAccess = MemAccessTy->isVectorTy(); + + // A store of an extracted vector element will be combined into a VSTE type + // instruction. + if (!IsVectorAccess && isa<StoreInst>(I)) { + Value *DataOp = I->getOperand(0); + if (isa<ExtractElementInst>(DataOp)) + IsVectorAccess = true; + } + + // A load which gets inserted into a vector element will be combined into a + // VLE type instruction. 
+ if (!IsVectorAccess && isa<LoadInst>(I) && I->hasOneUse()) { + User *LoadUser = *I->user_begin(); + if (isa<InsertElementInst>(LoadUser)) + IsVectorAccess = true; + } + + if (!isUInt<12>(Offset) && (IsFPAccess || IsVectorAccess)) return false; return true; diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index e16ced1661a1..8a6d28490e8c 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -44,7 +44,7 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString, const TargetOptions &Options) : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr), - RequireStructuredCFG(false), Options(Options) { + RequireStructuredCFG(false), DefaultOptions(Options), Options(Options) { if (EnableIPRA.getNumOccurrences()) this->Options.EnableIPRA = EnableIPRA; } @@ -63,14 +63,15 @@ bool TargetMachine::isPositionIndependent() const { /// \brief Reset the target options based on the function's attributes. // FIXME: This function needs to go away for a number of reasons: // a) global state on the TargetMachine is terrible in general, -// b) there's no default state here to keep, -// c) these target options should be passed only on the function +// b) these target options should be passed only on the function // and not on the TargetMachine (via TargetOptions) at all. void TargetMachine::resetTargetOptions(const Function &F) const { #define RESET_OPTION(X, Y) \ do { \ if (F.hasFnAttribute(Y)) \ Options.X = (F.getFnAttribute(Y).getValueAsString() == "true"); \ + else \ + Options.X = DefaultOptions.X; \ } while (0) RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad"); @@ -87,6 +88,8 @@ void TargetMachine::resetTargetOptions(const Function &F) const { Options.FPDenormalMode = FPDenormal::PreserveSign; else if (Denormal == "positive-zero") Options.FPDenormalMode = FPDenormal::PositiveZero; + else + Options.FPDenormalMode = DefaultOptions.FPDenormalMode; } /// Returns the code generation relocation model. The choices are static, PIC, diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 529540ea4ed2..bc7020fded8c 100644 --- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -663,6 +663,9 @@ bool WebAssemblyFastISel::fastLowerArguments() { for (auto const &Arg : F->args()) MFI->addParam(getLegalType(getSimpleType(Arg.getType()))); + if (!F->getReturnType()->isVoidTy()) + MFI->addResult(getLegalType(getSimpleType(F->getReturnType()))); + return true; } diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index d5474a02ce01..adf904ee0269 100644 --- a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -62,12 +62,19 @@ ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() { // Recursively descend the def-use lists from V to find non-bitcast users of // bitcasts of V. 
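In the hunk below, FindUses gains a set of already-visited constants. Constant expressions are uniqued module-wide, so the same ConstantExpr bitcast of a function can appear on many use lists while being RAUW'd only once; without deduplication, each sighting would queue another wrapper rewrite. The guard reduces to this shape (names taken from the patch itself):

if (isa<Constant>(U.get())) {
  // insert() returns {iterator, inserted}; a false second member means this
  // uniqued constant bitcast has already been queued for RAUW.
  auto c = ConstantBCs.insert(cast<Constant>(U.get()));
  if (!c.second)
    continue;
}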
static void FindUses(Value *V, Function &F, - SmallVectorImpl<std::pair<Use *, Function *>> &Uses) { + SmallVectorImpl<std::pair<Use *, Function *>> &Uses, + SmallPtrSetImpl<Constant *> &ConstantBCs) { for (Use &U : V->uses()) { if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser())) - FindUses(BC, F, Uses); - else if (U.get()->getType() != F.getType()) + FindUses(BC, F, Uses, ConstantBCs); + else if (U.get()->getType() != F.getType()) { + if (isa<Constant>(U.get())) { + // Only add constant bitcasts to the list once; they get RAUW'd + auto c = ConstantBCs.insert(cast<Constant>(U.get())); + if (!c.second) continue; + } Uses.push_back(std::make_pair(&U, &F)); + } } } @@ -122,10 +129,10 @@ static Function *CreateWrapper(Function *F, FunctionType *Ty) { bool FixFunctionBitcasts::runOnModule(Module &M) { SmallVector<std::pair<Use *, Function *>, 0> Uses; + SmallPtrSet<Constant *, 2> ConstantBCs; // Collect all the places that need wrappers. - for (Function &F : M) - FindUses(&F, F, Uses); + for (Function &F : M) FindUses(&F, F, Uses, ConstantBCs); DenseMap<std::pair<Function *, FunctionType *>, Function *> Wrappers; diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index bf546dab5fbb..47aadf99e860 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -46,7 +46,7 @@ unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) { unsigned WebAssemblyTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost( Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 2a2e3941f82d..f658609f8930 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -61,7 +61,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); /// @} diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index dc18a59a30ba..83a23d4ad680 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -209,9 +209,9 @@ def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", "HasSlowDivide32", "true", "Use 8-bit divide for positive values less than 256">; -def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", +def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl", "HasSlowDivide64", "true", - "Use 16-bit divide for positive values less than 65536">; + "Use 32-bit divide for positive values less than 2^32">; def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; @@ -461,6 +461,7 @@ def SNBFeatures : ProcessorFeatures<[], [ 
FeatureCMPXCHG16B, FeaturePOPCNT, FeatureAES, + FeatureSlowDivide64, FeaturePCLMUL, FeatureXSAVE, FeatureXSAVEOPT, @@ -760,6 +761,42 @@ def : Proc<"bdver4", [ FeatureMWAITX ]>; +// TODO: The scheduler model falls to BTVER2 model. +// The znver1 model has to be put in place. +// Zen +def: ProcessorModel<"znver1", BtVer2Model, [ + FeatureADX, + FeatureAES, + FeatureAVX2, + FeatureBMI, + FeatureBMI2, + FeatureCLFLUSHOPT, + FeatureCMPXCHG16B, + FeatureF16C, + FeatureFMA, + FeatureFSGSBase, + FeatureFXSR, + FeatureFastLZCNT, + FeatureLAHFSAHF, + FeatureLZCNT, + FeatureMMX, + FeatureMOVBE, + FeatureMWAITX, + FeaturePCLMUL, + FeaturePOPCNT, + FeaturePRFCHW, + FeatureRDRAND, + FeatureRDSEED, + FeatureSHA, + FeatureSMAP, + FeatureSSE4A, + FeatureSlowSHLD, + FeatureX87, + FeatureXSAVE, + FeatureXSAVEC, + FeatureXSAVEOPT, + FeatureXSAVES]>; + def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 8b66790679d9..8ab4c0616880 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -183,16 +183,6 @@ namespace { void PreprocessISelDAG() override; - inline bool immSext8(SDNode *N) const { - return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); - } - - // True if the 64-bit immediate fits in a 32-bit sign-extended field. - inline bool i64immSExt32(SDNode *N) const { - uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); - return (int64_t)v == (int32_t)v; - } - // Include the pieces autogenerated from the target description. #include "X86GenDAGISel.inc" diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index db76ddf04c06..787dff99367e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -97,12 +97,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); - // Bypass expensive divides on Atom when compiling with O2. + // Bypass expensive divides and use cheaper ones. if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) - addBypassSlowDiv(64, 16); + addBypassSlowDiv(64, 32); } if (Subtarget.isTargetKnownWindowsMSVC() || @@ -1280,6 +1280,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); // FIXME. This commands are available on SSE/AVX2, add relevant patterns. 
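Earlier in this hunk, addBypassSlowDiv(64, 16) becomes addBypassSlowDiv(64, 32), matching the redefined FeatureSlowDivide64 ("use 32-bit divide for positive values less than 2^32") that SNBFeatures now carries. The BypassSlowDivision transform emits a runtime width check with a cheap and an expensive path; conceptually it behaves like this sketch, not the literal generated IR:

#include <cstdint>

static uint64_t div64(uint64_t a, uint64_t b) {
  if (((a | b) >> 32) == 0)              // both operands < 2^32
    return uint32_t(a) / uint32_t(b);    // one cheap 32-bit divl
  return a / b;                          // full 64-bit divq
}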
 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
@@ -1306,10 +1308,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
- if (Subtarget.hasDQI()) {
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
- }
+
 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
 setOperationAction(ISD::FFLOOR, VT, Legal);
 setOperationAction(ISD::FCEIL, VT, Legal);
@@ -8090,6 +8089,37 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
 return Zeroable;
 }
+// The shuffle result is as follows:
+// 0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements are in ascending order.
+// Each element of Zeroable corresponds to a particular element of Mask, as
+// described in the computeZeroableShuffleElements function.
+//
+// The function looks for a sub-mask whose nonzero elements are in
+// increasing order. If such a sub-mask exists, the function returns true.
+static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
+ ArrayRef<int> Mask, const EVT &VectorType,
+ bool &IsZeroSideLeft) {
+ int NextElement = -1;
+ // Check if the Mask's nonzero elements are in increasing order.
+ for (int i = 0, e = Zeroable.size(); i < e; i++) {
+ // Check that the mask's zero elements are built from only zeros.
+ if (Mask[i] == -1)
+ return false;
+ if (Zeroable[i])
+ continue;
+ // Find the lowest non-zero element.
+ if (NextElement == -1) {
+ NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
+ IsZeroSideLeft = NextElement != 0;
+ }
+ // Exit if the mask's non-zero elements are not in increasing order.
+ if (NextElement != Mask[i])
+ return false;
+ NextElement++;
+ }
+ return true;
+}
+
 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
 ArrayRef<int> Mask, SDValue V1,
@@ -8145,6 +8175,46 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
 }
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl);
+
+// Function convertBitVectorToUnsigned - takes a SmallBitVector as argument
+// and converts it to an unsigned integer.
+// The output of the function is ~Zeroable, i.e. the complement of Zeroable.
+static unsigned convertBitVectorToUnsigned(const SmallBitVector &Zeroable) {
+ unsigned convertBit = 0;
+ for (int i = 0, e = Zeroable.size(); i < e; i++)
+ convertBit |= !(Zeroable[i]) << i;
+ return convertBit;
+}
+
+// X86 has a dedicated shuffle that can be lowered to VEXPAND.
+static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
+ const SmallBitVector &Zeroable,
+ ArrayRef<int> Mask, SDValue &V1,
+ SDValue &V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ bool IsLeftZeroSide = true;
+ if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
+ IsLeftZeroSide))
+ return SDValue();
+ unsigned VEXPANDMask = convertBitVectorToUnsigned(Zeroable);
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
+ unsigned NumElts = VT.getVectorNumElements();
+ assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
+ "Unexpected number of vector elements");
+ SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
+ Subtarget, DAG, DL);
+ SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
+ SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
+ return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
+ DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+ ZeroVector);
+}
+
 // X86 has dedicated unpack instructions that can handle specific blend
 // operations: UNPCKH and UNPCKL.
 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
@@ -12159,6 +12229,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
 return Result;
+ // If we have VLX support, we can use VEXPAND.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
 // If we have AVX2 then we always want to lower with a blend because at v4 we
 // can fully permute the elements.
@@ -12222,12 +12297,17 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 Zeroable, Subtarget, DAG))
 return Shift;
- // If we have VLX support, we can use VALIGN.
- if (Subtarget.hasVLX())
+ // If we have VLX support, we can use VALIGN or VEXPAND.
+ if (Subtarget.hasVLX()) {
 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64,
 V1, V2, Mask, Subtarget, DAG))
 return Rotate;
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
+ }
+
 // Try to use PALIGNR.
 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
 Mask, Subtarget, DAG))
@@ -12328,6 +12408,11 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
 return Result;
+ // If we have VLX support, we can use VEXPAND.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
 // If we have AVX2 then we always want to lower with a blend because at v8 we
 // can fully permute the elements.
@@ -12392,12 +12477,17 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 Zeroable, Subtarget, DAG))
 return Shift;
- // If we have VLX support, we can use VALIGN.
- if (Subtarget.hasVLX())
+ // If we have VLX support, we can use VALIGN or VEXPAND.
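// How the VEXPAND execution mask relates to Zeroable, as a standalone C++
// sketch (assumes an 8-lane shuffle; std::array stands in for
// SmallBitVector). A lane is "zeroable" when the shuffle writes a zero
// there; VEXPAND wants a set bit for every lane that receives the next
// source element in order, so the mask is the complement of Zeroable:
#include <array>
#include <cstdint>

uint8_t expandMask(const std::array<bool, 8> &Zeroable) {
  uint8_t M = 0;
  for (int i = 0; i < 8; ++i)
    M |= uint8_t(!Zeroable[i]) << i;
  return M;
}
// e.g. the shuffle <0, z, 1, z, 2, z, 3, z> zeroes lanes 1, 3, 5, 7, so
// expandMask returns 0b01010101 and VEXPAND scatters source elements 0..3
// into the set-bit lanes, which is exactly the semantics checked above.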
+ if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; + } + // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) @@ -12754,6 +12844,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12796,11 +12887,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Op; + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, + V2, DAG, Subtarget)) + return V; + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12832,6 +12928,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, // Otherwise, fall back to a SHUFPS sequence. return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } @@ -12889,6 +12989,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, + V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } @@ -12953,6 +13057,10 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } @@ -13089,9 +13197,9 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // the requisite ISA extensions for that element type are available. 
switch (VT.SimpleTy) { case MVT::v8f64: - return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: - return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: @@ -15187,13 +15295,13 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, MVT InVT = In.getSimpleValueType(); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) - return SDValue(); - if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 && + (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) return DAG.getNode(X86ISD::VZEXT, DL, VT, In); - assert(InVT.getVectorElementType() == MVT::i1); + if (InVT.getVectorElementType() != MVT::i1) + return SDValue(); // Extend VT if the target is 256 or 128bit vector and VLX is not supported. MVT ExtVT = VT; @@ -15910,6 +16018,12 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, } } + // Sometimes flags can be set either with an AND or with an SRL/SHL + // instruction. SRL/SHL variant should be preferred for masks longer than this + // number of bits. + const int ShiftToAndMaxMaskWidth = 32; + const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE); + // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. We use the variable 'Op', which is the // non-casted variable when we check for possible users. @@ -15958,7 +16072,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. - if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && + if (ZeroCheck && Op->hasOneUse() && isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); @@ -15968,7 +16082,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, APInt Mask = ArithOp.getOpcode() == ISD::SRL ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); - if (!Mask.isSignedIntN(32)) // Avoid large immediates. + if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) break; Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), DAG.getConstant(Mask, dl, VT)); @@ -15977,18 +16091,59 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::AND: // If the primary 'and' result isn't used, don't bother using X86ISD::AND, - // because a TEST instruction will be better. + // because a TEST instruction will be better. However, AND should be + // preferred if the instruction can be combined into ANDN. 
if (!hasNonFlagsUse(Op)) { SDValue Op0 = ArithOp->getOperand(0); SDValue Op1 = ArithOp->getOperand(1); EVT VT = ArithOp.getValueType(); bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1); bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64; + bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI(); + + // If we cannot select an ANDN instruction, check if we can replace + // AND+IMM64 with a shift before giving up. This is possible for masks + // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag. + if (!isProperAndn) { + if (!ZeroCheck) + break; + + assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized"); + auto *CN = dyn_cast<ConstantSDNode>(Op1); + if (!CN) + break; + + const APInt &Mask = CN->getAPIntValue(); + if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) + break; // Prefer TEST instruction. + + unsigned BitWidth = Mask.getBitWidth(); + unsigned LeadingOnes = Mask.countLeadingOnes(); + unsigned TrailingZeros = Mask.countTrailingZeros(); + + if (LeadingOnes + TrailingZeros == BitWidth) { + assert(TrailingZeros < VT.getSizeInBits() && + "Shift amount should be less than the type width"); + MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); + SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy); + Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt); + break; + } + + unsigned LeadingZeros = Mask.countLeadingZeros(); + unsigned TrailingOnes = Mask.countTrailingOnes(); + + if (LeadingZeros + TrailingOnes == BitWidth) { + assert(LeadingZeros < VT.getSizeInBits() && + "Shift amount should be less than the type width"); + MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); + SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy); + Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt); + break; + } - // But if we can combine this into an ANDN operation, then create an AND - // now and allow it to be pattern matched into an ANDN. 
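// The equivalence the block above exploits, as a standalone C++ sketch
// (assumes C++20 for <bit>; the maskedIsZero name is hypothetical). When the
// AND cannot become a one-instruction ANDN on BMI targets, a mask of the
// shape "ones then zeros", e.g. 0xFF00000000000000, still avoids a 64-bit
// immediate because only the zero flag is needed:
#include <bit>
#include <cstdint>

bool maskedIsZero(uint64_t x, uint64_t mask) {
  int ones = std::countl_one(mask);   // leading ones
  int zeros = std::countr_zero(mask); // trailing zeros
  // The mirrored "zeros then ones" shape is handled the same way with SHL.
  if (mask != 0 && ones + zeros == 64)
    return (x >> zeros) == 0;         // SRL + TEST, no 64-bit immediate
  return (x & mask) == 0;             // general case
}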
- if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType) break; + } } LLVM_FALLTHROUGH; case ISD::SUB: @@ -16008,7 +16163,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: { - if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + if (!NeedTruncation && ZeroCheck) { if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG)) return EFLAGS; } @@ -17283,17 +17438,20 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, unsigned NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) - return SDValue(); - - if (VT.is512BitVector() && InVTElt != MVT::i1) { + if (VT.is512BitVector() && InVTElt != MVT::i1 && + (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) { if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); return DAG.getNode(X86ISD::VSEXT, dl, VT, In); } - assert (InVTElt == MVT::i1 && "Unexpected vector type"); - MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + if (InVTElt != MVT::i1) + return SDValue(); + + MVT ExtVT = VT; + if (!VT.is512BitVector() && !Subtarget.hasVLX()) + ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + SDValue V; if (Subtarget.hasDQI()) { V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In); @@ -17302,7 +17460,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl); SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); - if (VT.is512BitVector()) + if (ExtVT == VT) return V; } @@ -18418,13 +18576,13 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { - SDValue Op0 = ShAmt.getOperand(0); - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); - ShAmt = DAG.getZeroExtendVectorInReg(Op0, SDLoc(Op0), MVT::v2i64); + ShAmt = ShAmt.getOperand(0); + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt); + ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); } else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); - ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); + ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); } else { SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; @@ -21643,14 +21801,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, } if (VT == MVT::v16i8 || - (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) { + (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || + (VT == MVT::v64i8 && Subtarget.hasBWI())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { - // On SSE41 targets we make use of the fact that VSELECT lowers - // to PBLENDVB which selects bytes based just on the sign bit. 
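// What the SignBitSelect lambda below computes per byte lane, shown
// standalone in plain C++ (the vector forms handle all 16/32/64 lanes at
// once): PBLENDVB, and the AVX512BW masked blend added for v64i8, pick V1
// wherever the selector byte's sign bit is set, else V0.
#include <cstdint>

uint8_t blendByte(uint8_t Sel, uint8_t V0, uint8_t V1) {
  return (Sel & 0x80) ? V1 : V0;
}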
- if (Subtarget.hasSSE41()) { + if (VT.is512BitVector()) { + // On AVX512BW targets we make use of the fact that VSELECT lowers + // to a masked blend which selects bytes based just on the sign bit + // extracted to a mask. + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + V0 = DAG.getBitcast(VT, V0); + V1 = DAG.getBitcast(VT, V1); + Sel = DAG.getBitcast(VT, Sel); + Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel); + return DAG.getBitcast(SelVT, + DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + } else if (Subtarget.hasSSE41()) { + // On SSE41 targets we make use of the fact that VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); @@ -28633,17 +28803,20 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() != ISD::VSELECT) return SDValue(); + assert(CondVT.isVector() && "Vector select expects a vector selector!"); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); - // Check if the first operand is all zeros.This situation only - // applies to avx512. - if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) { + // Check if the first operand is all zeros and Cond type is vXi1. + // This situation only applies to avx512. + if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && + CondVT.getVectorElementType() == MVT::i1) { //Invert the cond to not(cond) : xor(op,allones)=not(op) SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, DL, Cond.getValueType())); + DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()), + DL, CondVT)); //Vselect cond, op1, op2 = Vselect not(cond), op2, op1 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); } - assert(CondVT.isVector() && "Vector select expects a vector selector!"); // To use the condition operand as a bitwise mask, it must have elements that // are the same size as the select elements. Ie, the condition operand must @@ -29282,11 +29455,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// Combine: +/// Combine brcond/cmov/setcc/.. based on comparing the result of +/// atomic_load_add to use EFLAGS produced by the addition +/// directly if possible. For example: +/// +/// (setcc (cmp (atomic_load_add x, -C) C), COND_E) +/// becomes: +/// (setcc (LADD x, -C), COND_E) +/// +/// and /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) -/// to: +/// becomes: /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) -/// i.e., reusing the EFLAGS produced by the LOCKed instruction. +/// /// Note that this is only legal for some op/cc combinations. static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG) { @@ -29295,7 +29476,13 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); - // This only applies to variations of the common case: + // Can't replace the cmp if it has more uses than the one we're looking at. + // FIXME: We would like to be able to handle this, but would need to make sure + // all uses were updated. 
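// The arithmetic behind the new Comparand == -Addend case handled below,
// standalone in plain C++ (fetchAddWasEq is a hypothetical name). fetch_add
// returns the *old* value, and a LOCK ADD sets ZF from old + Addend; since
// old == C holds exactly when old + Addend == C + Addend, the compare is
// free whenever C + Addend == 0, i.e. C == -Addend:
#include <atomic>
#include <cstdint>

bool fetchAddWasEq(std::atomic<int64_t> &X, int64_t Addend, int64_t C) {
  int64_t Old = X.fetch_add(Addend); // one LOCK XADD; flags encode Old + Addend
  return Old == C;                   // foldable into ZF when C == -Addend
}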
+ if (!Cmp.hasOneUse())
+ return SDValue();
+
+ // This applies to variations of the common case:
 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
@@ -29314,8 +29501,9 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
 return SDValue();
 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
- if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
+ if (!CmpRHSC)
 return SDValue();
+ APInt Comparand = CmpRHSC->getAPIntValue();
 const unsigned Opc = CmpLHS.getOpcode();
@@ -29331,16 +29519,19 @@
 if (Opc == ISD::ATOMIC_LOAD_SUB)
 Addend = -Addend;
- if (CC == X86::COND_S && Addend == 1)
+ if (Comparand == -Addend) {
+ // No change to CC.
+ } else if (CC == X86::COND_S && Comparand == 0 && Addend == 1) {
 CC = X86::COND_LE;
- else if (CC == X86::COND_NS && Addend == 1)
+ } else if (CC == X86::COND_NS && Comparand == 0 && Addend == 1) {
 CC = X86::COND_G;
- else if (CC == X86::COND_G && Addend == -1)
+ } else if (CC == X86::COND_G && Comparand == 0 && Addend == -1) {
 CC = X86::COND_GE;
- else if (CC == X86::COND_LE && Addend == -1)
+ } else if (CC == X86::COND_LE && Comparand == 0 && Addend == -1) {
 CC = X86::COND_L;
- else
+ } else {
 return SDValue();
+ }
 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
@@ -31083,10 +31274,15 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
 /// Check if truncation with saturation from type \p SrcVT to \p DstVT
 /// is valid for the given \p Subtarget.
-static bool
-isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
+static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
+ const X86Subtarget &Subtarget) {
 if (!Subtarget.hasAVX512())
 return false;
+
+ // FIXME: Scalar type may be supported if we move it to vector register.
+ if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
+ return false;
+
 EVT SrcElVT = SrcVT.getScalarType();
 EVT DstElVT = DstVT.getScalarType();
 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
@@ -31098,40 +31294,69 @@ isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
 return false;
 }
+/// Return true if the VPACK* instruction can be used for the given types
+/// and it is available on \p Subtarget.
+static bool
+isSATValidOnSSESubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
+ if (Subtarget.hasSSE2())
+ // v16i16 -> v16i8
+ if (SrcVT == MVT::v16i16 && DstVT == MVT::v16i8)
+ return true;
+ if (Subtarget.hasSSE41())
+ // v8i32 -> v8i16
+ if (SrcVT == MVT::v8i32 && DstVT == MVT::v8i16)
+ return true;
+ return false;
+}
+
 /// Detect a pattern of truncation with saturation:
 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
 /// Return the source value to be truncated or SDValue() if the pattern was not
-/// matched or the unsupported on the current target.
-static SDValue
-detectUSatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget) {
+/// matched.
+static SDValue detectUSatPattern(SDValue In, EVT VT) {
 if (In.getOpcode() != ISD::UMIN)
 return SDValue();
- EVT InVT = In.getValueType();
- // FIXME: Scalar type may be supported if we move it to vector register.
- if (!InVT.isVector() || !InVT.isSimple())
- return SDValue();
-
- if (!isSATValidOnSubtarget(InVT, VT, Subtarget))
- return SDValue();
- //Saturation with truncation. We truncate from InVT to VT.
- assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
+ assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
 "Unexpected types for truncate operation");
- SDValue SrcVal;
 APInt C;
- if (ISD::isConstantSplatVector(In.getOperand(0).getNode(), C))
- SrcVal = In.getOperand(1);
- else if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C))
- SrcVal = In.getOperand(0);
- else
+ if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
+ // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
+ // the element size of the destination type.
+ return APIntOps::isMask(VT.getScalarSizeInBits(), C) ? In.getOperand(0) :
+ SDValue();
+ }
+ return SDValue();
+}
+
+/// Detect a pattern of truncation with saturation:
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// The types should allow use of the VPMOVUS* instruction on AVX512.
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
+ const X86Subtarget &Subtarget) {
+ if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
 return SDValue();
+ return detectUSatPattern(In, VT);
+}
- // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
- // the element size of the destination type.
- return (C == ((uint64_t)1 << VT.getScalarSizeInBits()) - 1) ?
- SrcVal : SDValue();
+static SDValue
+combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue USatVal = detectUSatPattern(In, VT);
+ if (USatVal) {
+ if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+ if (isSATValidOnSSESubtarget(In.getValueType(), VT, Subtarget)) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(USatVal, DL);
+ return DAG.getNode(X86ISD::PACKUS, DL, VT, Lo, Hi);
+ }
+ }
+ return SDValue();
 }
 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
@@ -31701,7 +31926,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
 St->getMemOperand()->getFlags());
 if (SDValue Val =
- detectUSatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
+ detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
 dl, Val, St->getBasePtr(), St->getMemoryVT(),
 St->getMemOperand(), DAG);
@@ -32326,9 +32551,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
 return Avg;
- // Try the truncation with unsigned saturation.
- if (SDValue Val = detectUSatPattern(Src, VT, Subtarget))
- return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Val);
+ // Try to combine truncation with unsigned saturation.
+ if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
+ return Val;
 // The bitcast source is a direct mmx result.
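// Scalar model of the pattern detectUSatPattern matches, standalone C++
// (the vector combine applies this per lane): truncate(umin(x, 0xFF)) is an
// unsigned saturating narrow, so values above the destination maximum clamp
// instead of wrapping; AVX512 selects VPMOVUS*, SSE splits and uses PACKUS.
#include <algorithm>
#include <cstdint>

uint8_t truncUSat8(uint32_t X) {
  return uint8_t(std::min<uint32_t>(X, 0xFF));
}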
// Detect bitcasts between i32 to x86mmx diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index d44d1395f243..230d1700b8d2 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -5957,6 +5957,30 @@ let Predicates = [HasAVX512] in { (VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>; } // Predicates = [HasAVX512] +// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang +// which produce unnecessary vmovs{s,d} instructions +let Predicates = [HasAVX512] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>; +} // Predicates = [HasAVX512] + // Convert float/double to signed/unsigned int 32/64 with truncation multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, X86VectorVTInfo _DstRC, SDNode OpNode, @@ -6136,6 +6160,21 @@ def : Pat<(f32 (fpround FR64X:$src)), (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + //===----------------------------------------------------------------------===// // AVX-512 Vector convert from signed/unsigned integer to float/double // and from float/double to signed/unsigned integer diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 09971d586a41..1812d01711d1 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -33,7 +33,6 @@ class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm, InstrItinClass ri = arg_ri; } - // scalar let Sched = WriteFAdd in { def SSE_ALU_F32S : OpndItins< @@ -1923,6 +1922,79 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, } } // isCodeGenOnly = 1 +// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and +// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary +// vmovs{s,d} instructions +let Predicates = [UseAVX] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + 
(Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseAVX] + +let Predicates = [UseSSE2] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseSSE2] + +let Predicates = [UseSSE1] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseSSE1] + // Convert packed single/double fp to doubleword def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 92c16214aa4a..d80dc4a9b5e8 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -216,7 +216,7 @@ protected: /// 32-bit divisions and should be used when possible. bool HasSlowDivide32; - /// True if 16-bit divides are significantly faster than + /// True if 32-bit divides are significantly faster than /// 64-bit divisions and should be used when possible. bool HasSlowDivide64; diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 107ed9359376..5715d826862e 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -114,15 +114,62 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { } int X86TTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, - TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + static const CostTblEntry SLMCostTable[] = { + { ISD::MUL, MVT::v4i32, 11 }, // pmulld + { ISD::MUL, MVT::v8i16, 2 }, // pmullw + { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence. 
+ { ISD::FMUL, MVT::f64, 2 }, // mulsd
+ { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
+ { ISD::FMUL, MVT::v4f32, 2 }, // mulps
+ { ISD::FDIV, MVT::f32, 17 }, // divss
+ { ISD::FDIV, MVT::v4f32, 39 }, // divps
+ { ISD::FDIV, MVT::f64, 32 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 69 }, // divpd
+ { ISD::FADD, MVT::v2f64, 2 }, // addpd
+ { ISD::FSUB, MVT::v2f64, 2 }, // subpd
+ // v2i64/v4i64 mul is custom lowered as a series of long
+ // multiplies (3), shifts (3) and adds (2).
+ // SLM muldq version throughput is 2.
+ { ISD::MUL, MVT::v2i64, 11 },
+ };
+
+ if (ST->isSLM()) {
+ if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
+ // Check if the operands can be shrunk into a smaller datatype.
+ bool Op1Signed = false;
+ unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
+ bool Op2Signed = false;
+ unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
+
+ bool signedMode = Op1Signed | Op2Signed;
+ unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
+
+ if (OpMinSize <= 7)
+ return LT.first * 3; // pmullw/sext
+ if (!signedMode && OpMinSize <= 8)
+ return LT.first * 3; // pmullw/zext
+ if (OpMinSize <= 15)
+ return LT.first * 5; // pmullw/pmulhw/pshuf
+ if (!signedMode && OpMinSize <= 16)
+ return LT.first * 5; // pmullw/pmulhw/pshuf
+ }
+ if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
+ LT.second)) {
+ return LT.first * Entry->Cost;
+ }
+ }
+
 if (ISD == ISD::SDIV &&
 Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
 Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
@@ -276,6 +323,10 @@ int X86TTIImpl::getArithmeticInstrCost(
 { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
 { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
+ { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
+ { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
 { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
 { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index c013805f4321..ecaaf951cff7 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -60,7 +60,8 @@ public:
 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
 int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
 int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
 int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index 82daf754be0d..deb7e819480b 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -270,12 +270,12 @@ class LowerTypeTestsModule {
 /// relative to the start address.
 Constant *AlignLog2;
- /// ByteArray, Inline, AllOnes: size of the memory region covering members
- /// of this type identifier as a multiple of 2^AlignLog2.
- Constant *Size;
+ /// ByteArray, Inline, AllOnes: one less than the size of the memory region
+ /// covering members of this type identifier as a multiple of 2^AlignLog2.
+ Constant *SizeM1; - /// ByteArray, Inline, AllOnes: range of the size expressed as a bit width. - unsigned SizeBitWidth; + /// ByteArray, Inline, AllOnes: range of SizeM1 expressed as a bit width. + unsigned SizeM1BitWidth; /// ByteArray: the byte array to test the address against. Constant *TheByteArray; @@ -593,8 +593,8 @@ Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI, IntPtrTy)); Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL); - Constant *BitSizeConst = ConstantExpr::getZExt(TIL.Size, IntPtrTy); - Value *OffsetInRange = B.CreateICmpULT(BitOffset, BitSizeConst); + Constant *BitSizeConst = ConstantExpr::getZExt(TIL.SizeM1, IntPtrTy); + Value *OffsetInRange = B.CreateICmpULE(BitOffset, BitSizeConst); // If the bit set is all ones, testing against it is unnecessary. if (TIL.TheKind == TypeTestResolution::AllOnes) @@ -711,13 +711,13 @@ void LowerTypeTestsModule::lowerTypeTestCalls( if (BSI.isAllOnes()) { TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single : TypeTestResolution::AllOnes; - TIL.SizeBitWidth = (BSI.BitSize <= 256) ? 8 : 32; - TIL.Size = ConstantInt::get((BSI.BitSize <= 256) ? Int8Ty : Int32Ty, - BSI.BitSize); + TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32; + TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? Int8Ty : Int32Ty, + BSI.BitSize - 1); } else if (BSI.BitSize <= 64) { TIL.TheKind = TypeTestResolution::Inline; - TIL.SizeBitWidth = (BSI.BitSize <= 32) ? 5 : 6; - TIL.Size = ConstantInt::get(Int8Ty, BSI.BitSize); + TIL.SizeM1BitWidth = (BSI.BitSize <= 32) ? 5 : 6; + TIL.SizeM1 = ConstantInt::get(Int8Ty, BSI.BitSize - 1); uint64_t InlineBits = 0; for (auto Bit : BSI.Bits) InlineBits |= uint64_t(1) << Bit; @@ -728,9 +728,9 @@ void LowerTypeTestsModule::lowerTypeTestCalls( (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits); } else { TIL.TheKind = TypeTestResolution::ByteArray; - TIL.SizeBitWidth = (BSI.BitSize <= 256) ? 8 : 32; - TIL.Size = ConstantInt::get((BSI.BitSize <= 256) ? Int8Ty : Int32Ty, - BSI.BitSize); + TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32; + TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? Int8Ty : Int32Ty, + BSI.BitSize - 1); ++NumByteArraysCreated; ByteArrayInfo *BAI = createByteArray(BSI); TIL.TheByteArray = BAI->ByteArray; diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 55151c13b430..2d34c1cc74bd 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1371,15 +1371,9 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL, &TLI, &DT, &AC)) return replaceInstUsesWith(I, V); - if (isa<Constant>(RHS)) { - if (isa<PHINode>(LHS)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; - - if (SelectInst *SI = dyn_cast<SelectInst>(LHS)) - if (Instruction *NV = FoldOpIntoSelect(I, SI)) - return NV; - } + if (isa<Constant>(RHS)) + if (Instruction *FoldedFAdd = foldOpWithConstantIntoOperand(I)) + return FoldedFAdd; // -A + B --> B - A // -A + -B --> -(A + B) diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index a59b43d6af5f..da5384a86aac 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1382,13 +1382,8 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } } - // Try to fold constant and into select arguments. 
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I)) + return FoldedLogic; } if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) @@ -2125,14 +2120,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Builder->getInt(C1->getValue() & ~RHS->getValue())); } - // Try to fold constant and into select arguments. - if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I)) + return FoldedLogic; } // Given an OR instruction, check to see if this is a bswap. @@ -2594,13 +2583,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } } - // Try to fold constant and into select arguments. - if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I)) + return FoldedLogic; } BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index b29ed3c87451..2ef82ba3ed8c 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1599,21 +1599,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // fma fneg(x), fneg(y), z -> fma x, y, z if (match(Src0, m_FNeg(m_Value(LHS))) && match(Src1, m_FNeg(m_Value(RHS)))) { - CallInst *NewCall = Builder->CreateCall(II->getCalledFunction(), - {LHS, RHS, II->getArgOperand(2)}); - NewCall->takeName(II); - NewCall->copyFastMathFlags(II); - return replaceInstUsesWith(*II, NewCall); + II->setArgOperand(0, LHS); + II->setArgOperand(1, RHS); + return II; } // fma fabs(x), fabs(x), z -> fma x, x, z if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) && match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) { - CallInst *NewCall = Builder->CreateCall(II->getCalledFunction(), - {LHS, LHS, II->getArgOperand(2)}); - NewCall->takeName(II); - NewCall->copyFastMathFlags(II); - return replaceInstUsesWith(*II, NewCall); + II->setArgOperand(0, LHS); + II->setArgOperand(1, RHS); + return II; } // fma x, 1, z -> fadd x, z @@ -2760,6 +2756,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (KnownOne.isAllOnesValue()) return eraseInstFromFunction(*II); + // Update the cache of affected values for this assumption (we might be + // here because we just simplified the condition). + AC.updateAffectedValues(II); break; } case Intrinsic::experimental_gc_relocate: { diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 3cefe715e567..2847ce858e79 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -320,7 +320,6 @@ private: Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const; Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset, SmallVectorImpl<Value *> &NewIndices); - Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); /// Classify whether a cast is worth optimizing. 
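// The identity behind the fma fold above, standalone C++ (assumes non-NaN
// inputs): the two fnegs cancel, so InstCombine can simply rewrite the two
// multiplicand operands in place instead of building a new call. The new
// (-x)/(-y) -> x/y fdiv fold further down rests on the same sign identity.
#include <cmath>

double fmaDropNegs(double A, double B, double C) {
  // std::fma(-A, -B, C) == std::fma(A, B, C), because (-A) * (-B) and A * B
  // are the same exact product before fma's single rounding.
  return std::fma(A, B, C);
}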
/// @@ -537,13 +536,21 @@ private: Value *SimplifyVectorOp(BinaryOperator &Inst); Value *SimplifyBSwap(BinaryOperator &Inst); - // FoldOpIntoPhi - Given a binary operator, cast instruction, or select - // which has a PHI node as operand #0, see if we can fold the instruction - // into the PHI (which is only possible if all operands to the PHI are - // constants). - // + + /// Given a binary operator, cast instruction, or select which has a PHI node + /// as operand #0, see if we can fold the instruction into the PHI (which is + /// only possible if all operands to the PHI are constants). Instruction *FoldOpIntoPhi(Instruction &I); + /// Given an instruction with a select as one operand and a constant as the + /// other operand, try to fold the binary operator into the select arguments. + /// This also works for Cast instructions, which obviously do not have a + /// second operand. + Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); + + /// This is a convenience wrapper function for the above two functions. + Instruction *foldOpWithConstantIntoOperand(Instruction &I); + /// \brief Try to rotate an operation below a PHI node, using PHI nodes for /// its operands. Instruction *FoldPHIArgOpIntoPHI(PHINode &PN); diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index ac64671725f3..45a19fb0f1f2 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -267,14 +267,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { // Simplify mul instructions with a constant RHS. if (isa<Constant>(Op1)) { - // Try to fold constant mul into select arguments. - if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (Instruction *FoldedMul = foldOpWithConstantIntoOperand(I)) + return FoldedMul; // Canonicalize (X+C1)*CI -> X*CI+C1*CI. { @@ -626,14 +620,8 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { // Simplify mul instructions with a constant RHS. if (isa<Constant>(Op1)) { - // Try to fold constant mul into select arguments. 
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (Instruction *FoldedMul = foldOpWithConstantIntoOperand(I)) + return FoldedMul; // (fmul X, -1.0) --> (fsub -0.0, X) if (match(Op1, m_SpecificFP(-1.0))) { @@ -956,14 +944,9 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { } } - if (*C2 != 0) { // avoid X udiv 0 - if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; - } + if (*C2 != 0) // avoid X udiv 0 + if (Instruction *FoldedDiv = foldOpWithConstantIntoOperand(I)) + return FoldedDiv; } } @@ -1443,6 +1426,16 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { } } + Value *LHS; + Value *RHS; + + // -x / -y -> x / y + if (match(Op0, m_FNeg(m_Value(LHS))) && match(Op1, m_FNeg(m_Value(RHS)))) { + I.setOperand(0, LHS); + I.setOperand(1, RHS); + return &I; + } + return nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp index 184897f751fe..4cbffe9533b7 100644 --- a/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -29,7 +29,7 @@ using namespace llvm::PatternMatch; /// locations of the original PHI node arguments. DebugLoc InstCombiner::PHIArgMergedDebugLoc(PHINode &PN) { auto *FirstInst = cast<Instruction>(PN.getIncomingValue(0)); - DILocation *Loc = FirstInst->getDebugLoc(); + const DILocation *Loc = FirstInst->getDebugLoc(); for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { auto *I = cast<Instruction>(PN.getIncomingValue(i)); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 5ad2a1c0e3e6..4ff9b64ac57c 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -530,13 +530,8 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1, return BinaryOperator::CreateMul(BO->getOperand(0), ConstantExpr::getShl(BOOp, Op1)); - // Try to fold constant and into select arguments. - if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (Instruction *FoldedShift = foldOpWithConstantIntoOperand(I)) + return FoldedShift; // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2)) if (TruncInst *TI = dyn_cast<TruncInst>(Op0)) { diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 9a52874c4c21..27fc34d23175 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -770,10 +770,6 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO, return RI; } -/// Given an instruction with a select as one operand and a constant as the -/// other operand, try to fold the binary operator into the select arguments. -/// This also works for Cast instructions, which obviously do not have a second -/// operand. Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { // Don't modify shared select instructions. 
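// The shape of the fold that FoldOpIntoSelect (whose body continues below)
// implements and that the new foldOpWithConstantIntoOperand wrapper
// dispatches to, standalone C++: an operation between a select and a
// constant distributes into the select's arms, where each arm can then
// constant-fold away.
int mulIntoSelect(bool Cond, int A, int B) {
  // before: (Cond ? A : B) * 4
  // after:   Cond ? (A * 4) : (B * 4)
  return Cond ? A * 4 : B * 4;
}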
if (!SI->hasOneUse()) @@ -824,9 +820,6 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI); } -/// Given a binary operator, cast instruction, or select which has a PHI node as -/// operand #0, see if we can fold the instruction into the PHI (which is only -/// possible if all operands to the PHI are constants). Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PHINode *PN = cast<PHINode>(I.getOperand(0)); unsigned NumPHIValues = PN->getNumIncomingValues(); @@ -964,6 +957,19 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { return replaceInstUsesWith(I, NewPN); } +Instruction *InstCombiner::foldOpWithConstantIntoOperand(Instruction &I) { + assert(isa<Constant>(I.getOperand(1)) && "Unexpected operand type"); + + if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) { + if (Instruction *NewSel = FoldOpIntoSelect(I, Sel)) + return NewSel; + } else if (isa<PHINode>(I.getOperand(0))) { + if (Instruction *NewPhi = FoldOpIntoPhi(I)) + return NewPhi; + } + return nullptr; +} + /// Given a pointer type and a constant offset, determine whether or not there /// is a sequence of GEP indices into the pointed type that will land us at the /// specified offset. If so, fill them into NewIndices and return the resultant diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 54bdc9e0772b..9c4b417e35e1 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1598,8 +1598,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, nullptr); unsigned SizeOfGlobalStruct = DL.getTypeAllocSize(GlobalStructTy); - assert((isPowerOf2_32(SizeOfGlobalStruct) || - !TargetTriple.isOSBinFormatCOFF()) && + assert(isPowerOf2_32(SizeOfGlobalStruct) && "global metadata will not be padded appropriately"); SmallVector<Constant *, 16> Initializers(UseMetadataArray ? n : 0); @@ -1766,13 +1765,11 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { GlobalValue::getRealLinkageName(G->getName())); Metadata->setSection(getGlobalMetadataSection()); + // We don't want any padding, but we also need a reasonable alignment. // The MSVC linker always inserts padding when linking incrementally. We // cope with that by aligning each struct to its size, which must be a power // of two. - if (TargetTriple.isOSBinFormatCOFF()) - Metadata->setAlignment(SizeOfGlobalStruct); - else - Metadata->setAlignment(1); // Don't leave padding in between. + Metadata->setAlignment(SizeOfGlobalStruct); // On platforms that support comdats, put the metadata and the // instrumented global in the same group. 
This ensures that the metadata diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp index 8da3e31200f3..adea7e772447 100644 --- a/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -32,6 +32,11 @@ cl::opt<bool> DoNameCompression("enable-name-compression", cl::desc("Enable name string compression"), cl::init(true)); +cl::opt<bool> DoHashBasedCounterSplit( + "hash-based-counter-split", + cl::desc("Rename counter variable of a comdat function based on cfg hash"), + cl::init(true)); + cl::opt<bool> ValueProfileStaticAlloc( "vp-static-alloc", cl::desc("Do static counter allocation for value profiler"), @@ -272,7 +277,16 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) { static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) { StringRef NamePrefix = getInstrProfNameVarPrefix(); StringRef Name = Inc->getName()->getName().substr(NamePrefix.size()); - return (Prefix + Name).str(); + Function *F = Inc->getParent()->getParent(); + Module *M = F->getParent(); + if (!DoHashBasedCounterSplit || !isIRPGOFlagSet(M) || + !canRenameComdatFunc(*F)) + return (Prefix + Name).str(); + uint64_t FuncHash = Inc->getHash()->getZExtValue(); + SmallVector<char, 24> HashPostfix; + if (Name.endswith((Twine(".") + Twine(FuncHash)).toStringRef(HashPostfix))) + return (Prefix + Name).str(); + return (Prefix + Name + "." + Twine(FuncHash)).str(); } static inline bool shouldRecordFunctionAddr(Function *F) { diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 28f4f7ea1455..04f9a64bef9f 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -119,7 +119,7 @@ static cl::opt<unsigned> MaxNumAnnotations( // Command line option to control appending FunctionHash to the name of a COMDAT // function. This is to avoid the hash mismatch caused by the preinliner. static cl::opt<bool> DoComdatRenaming( - "do-comdat-renaming", cl::init(true), cl::Hidden, + "do-comdat-renaming", cl::init(false), cl::Hidden, cl::desc("Append function hash to the name of COMDAT function to avoid " "function hash mismatch due to the preinliner")); @@ -134,6 +134,12 @@ static cl::opt<bool> PGOWarnMissing("pgo-warn-missing-function", static cl::opt<bool> NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false), cl::Hidden); +// Command line option to enable/disable the warning about a hash mismatch in +// the profile data for Comdat functions, which often turns out to be false +// positive due to the pre-instrumentation inline. +static cl::opt<bool> NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat", + cl::init(true), cl::Hidden); + // Command line option to enable/disable select instruction instrumentation. static cl::opt<bool> PGOInstrSelect("pgo-instr-select", cl::init(true), cl::Hidden); @@ -407,21 +413,9 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { static bool canRenameComdat( Function &F, std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) { - if (F.getName().empty()) - return false; - if (!needsComdatForCounter(F, *(F.getParent()))) - return false; - // Only safe to do if this function may be discarded if it is not used - // in the compilation unit. 
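// The renaming getVarName above performs, as a standalone C++ sketch
// (std::string stands in for LLVM's Twine/StringRef machinery; the
// "__profc_" prefix and the hash value are illustrative):
#include <cstdint>
#include <string>

std::string counterVarName(const std::string &Name, uint64_t FuncHash) {
  std::string Suffix = "." + std::to_string(FuncHash);
  // Skip appending if the name already ends with this hash.
  if (Name.size() >= Suffix.size() &&
      Name.compare(Name.size() - Suffix.size(), Suffix.size(), Suffix) == 0)
    return "__profc_" + Name;
  return "__profc_" + Name + Suffix; // e.g. "__profc_foo.63246"
}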
-  if (!GlobalValue::isDiscardableIfUnused(F.getLinkage()))
+  if (!DoComdatRenaming || !canRenameComdatFunc(F, true))
     return false;
-
-  // For AvailableExternallyLinkage functions.
-  if (!F.hasComdat()) {
-    assert(F.getLinkage() == GlobalValue::AvailableExternallyLinkage);
-    return true;
-  }
-
   // FIXME: Currently we only handle those Comdat groups that contain only one
   // function and function aliases.
   // (1) For a Comdat group containing multiple functions, we need to have a
@@ -803,7 +797,11 @@ bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) {
     } else if (Err == instrprof_error::hash_mismatch ||
                Err == instrprof_error::malformed) {
       NumOfPGOMismatch++;
-      SkipWarning = NoPGOWarnMismatch;
+      SkipWarning =
+          NoPGOWarnMismatch ||
+          (NoPGOWarnMismatchComdat &&
+           (F.hasComdat() ||
+            F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
     }
 
     if (SkipWarning)
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index 56df77f03028..06d3d6a73954 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -13,10 +13,12 @@ add_llvm_library(LLVMScalarOpts
   GuardWidening.cpp
   GVN.cpp
   GVNHoist.cpp
+  IVUsersPrinter.cpp
   InductiveRangeCheckElimination.cpp
   IndVarSimplify.cpp
   JumpThreading.cpp
   LICM.cpp
+  LoopAccessAnalysisPrinter.cpp
   LoopSink.cpp
   LoadCombine.cpp
   LoopDeletion.cpp
@@ -26,6 +28,7 @@ add_llvm_library(LLVMScalarOpts
   LoopInstSimplify.cpp
   LoopInterchange.cpp
   LoopLoadElimination.cpp
+  LoopPassManager.cpp
   LoopRerollPass.cpp
   LoopRotation.cpp
   LoopSimplifyCFG.cpp
diff --git a/lib/Transforms/Scalar/IVUsersPrinter.cpp b/lib/Transforms/Scalar/IVUsersPrinter.cpp
new file mode 100644
index 000000000000..807593379283
--- /dev/null
+++ b/lib/Transforms/Scalar/IVUsersPrinter.cpp
@@ -0,0 +1,22 @@
+//===- IVUsersPrinter.cpp - Induction Variable Users Printer ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/IVUsersPrinter.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "iv-users"
+
+PreservedAnalyses IVUsersPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+                                          LoopStandardAnalysisResults &AR,
+                                          LPMUpdater &U) {
+  AM.getResult<IVUsersAnalysis>(L, AR).print(OS);
+  return PreservedAnalyses::all();
+}
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 68faa886060a..1752fb75eb1b 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -25,15 +25,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar/IndVarSimplify.h"
-#include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/BasicBlock.h"
@@ -49,6 +47,8 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
@@ -2096,7 +2096,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
     return Builder.CreateGEP(nullptr, GEPBase, GEPOffset, "lftr.limit");
   } else {
     // In any other case, convert both IVInit and IVCount to integers before
-    // comparing. This may result in SCEV expension of pointers, but in practice
+    // comparing. This may result in SCEV expansion of pointers, but in practice
     // SCEV will fold the pointer arithmetic away as such:
     // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
     //
@@ -2482,23 +2482,13 @@ bool IndVarSimplify::run(Loop *L) {
   return Changed;
 }
 
-PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM) {
-  auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
+                                          LoopStandardAnalysisResults &AR,
+                                          LPMUpdater &) {
   Function *F = L.getHeader()->getParent();
   const DataLayout &DL = F->getParent()->getDataLayout();
 
-  auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
-  auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
-  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
-
-  assert((LI && SE && DT) &&
-         "Analyses required for indvarsimplify not available!");
-
-  // Optional analyses.
-  auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F);
-  auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F);
-
-  IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI);
+  IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI);
   if (!IVS.run(&L))
     return PreservedAnalyses::all();
 
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 6ef9d0561322..4c15c8a32bec 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -41,8 +41,8 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@@ -61,6 +61,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -84,14 +85,17 @@ static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
 static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
                             const LoopSafetyInfo *SafetyInfo);
 static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
-                  const LoopSafetyInfo *SafetyInfo);
+                  const LoopSafetyInfo *SafetyInfo,
+                  OptimizationRemarkEmitter *ORE);
 static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
                  const Loop *CurLoop, AliasSetTracker *CurAST,
-                 const LoopSafetyInfo *SafetyInfo);
-static bool isSafeToExecuteUnconditionally(const Instruction &Inst,
+                 const LoopSafetyInfo *SafetyInfo,
+                 OptimizationRemarkEmitter *ORE);
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,
                                            const DominatorTree *DT,
                                            const Loop *CurLoop,
                                            const LoopSafetyInfo *SafetyInfo,
+                                           OptimizationRemarkEmitter *ORE,
                                            const Instruction *CtxI = nullptr);
 static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
                                      const AAMDNodes &AAInfo,
@@ -104,7 +108,8 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
 namespace {
 struct LoopInvariantCodeMotion {
   bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
-                 TargetLibraryInfo *TLI, ScalarEvolution *SE, bool DeleteAST);
+                 TargetLibraryInfo *TLI, ScalarEvolution *SE,
+                 OptimizationRemarkEmitter *ORE, bool DeleteAST);
 
   DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() {
     return LoopToAliasSetMap;
@@ -135,12 +140,16 @@ struct LegacyLICMPass : public LoopPass {
     }
 
     auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+    // pass. Function analyses need to be preserved across loop transformations
+    // but ORE cannot be preserved (see comment before the pass definition).
+    OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
     return LICM.runOnLoop(L,
                           &getAnalysis<AAResultsWrapperPass>().getAAResults(),
                           &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
                           &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
                           &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
-                          SE ? &SE->getSE() : nullptr, false);
+                          SE ? &SE->getSE() : nullptr, &ORE, false);
   }
 
   /// This transformation requires natural loop information & requires that
@@ -176,21 +185,20 @@ private:
 };
 }
 
-PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM) {
+PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
+                                LoopStandardAnalysisResults &AR, LPMUpdater &) {
   const auto &FAM =
-      AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+      AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
   Function *F = L.getHeader()->getParent();
 
-  auto *AA = FAM.getCachedResult<AAManager>(*F);
-  auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
-  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
-  auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F);
-  auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
-  assert((AA && LI && DT && TLI && SE) && "Analyses for LICM not available");
+  auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
+  // FIXME: This should probably be optional rather than required.
+  if (!ORE)
+    report_fatal_error("LICM: OptimizationRemarkEmitterAnalysis not "
+                       "cached at a higher level");
 
   LoopInvariantCodeMotion LICM;
-
-  if (!LICM.runOnLoop(&L, AA, LI, DT, TLI, SE, true))
+  if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true))
     return PreservedAnalyses::all();
 
   // FIXME: There is no setPreservesCFG in the new PM. When that becomes
@@ -217,7 +225,9 @@ Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
 bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
                                         LoopInfo *LI, DominatorTree *DT,
                                         TargetLibraryInfo *TLI,
-                                        ScalarEvolution *SE, bool DeleteAST) {
+                                        ScalarEvolution *SE,
+                                        OptimizationRemarkEmitter *ORE,
+                                        bool DeleteAST) {
   bool Changed = false;
 
   assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
@@ -243,10 +253,10 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
   //
   if (L->hasDedicatedExits())
     Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
-                          CurAST, &SafetyInfo);
+                          CurAST, &SafetyInfo, ORE);
   if (Preheader)
     Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
-                           CurAST, &SafetyInfo);
+                           CurAST, &SafetyInfo, ORE);
 
   // Now that all loop invariants have been removed from the loop, promote any
   // memory references to scalars that we can.
@@ -279,7 +289,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
     for (AliasSet &AS : *CurAST)
       Promoted |=
          promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts, PIC, LI, DT,
-                                       TLI, L, CurAST, &SafetyInfo);
+                                       TLI, L, CurAST, &SafetyInfo, ORE);
 
   // Once we have promoted values across the loop body we have to
   // recursively reform LCSSA as any nested loop may now have values defined
@@ -320,7 +330,8 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
 ///
 bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
                       DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
-                      AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) {
+                      AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+                      OptimizationRemarkEmitter *ORE) {
 
   // Verify inputs.
   assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
@@ -336,7 +347,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
   bool Changed = false;
   const std::vector<DomTreeNode *> &Children = N->getChildren();
   for (DomTreeNode *Child : Children)
-    Changed |= sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
+    Changed |=
+        sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE);
 
   // Only need to process the contents of this block if it is not part of a
   // subloop (which would already have been processed).
@@ -363,9 +375,9 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
       // operands of the instruction are loop invariant.
       //
       if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
-          canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo)) {
+          canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
         ++II;
-        Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo);
+        Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE);
       }
     }
   return Changed;
@@ -378,7 +390,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
 ///
 bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
                        DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
-                       AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) {
+                       AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+                       OptimizationRemarkEmitter *ORE) {
   // Verify inputs.
   assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
          CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr &&
@@ -417,16 +430,17 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
       // is safe to hoist the instruction.
       //
       if (CurLoop->hasLoopInvariantOperands(&I) &&
-          canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo) &&
+          canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) &&
          isSafeToExecuteUnconditionally(
-              I, DT, CurLoop, SafetyInfo,
+              I, DT, CurLoop, SafetyInfo, ORE,
               CurLoop->getLoopPreheader()->getTerminator()))
-        Changed |= hoist(I, DT, CurLoop, SafetyInfo);
+        Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE);
     }
 
   const std::vector<DomTreeNode *> &Children = N->getChildren();
   for (DomTreeNode *Child : Children)
-    Changed |= hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
+    Changed |=
+        hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE);
   return Changed;
 }
 
@@ -465,7 +479,8 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
 bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                               Loop *CurLoop, AliasSetTracker *CurAST,
-                              LoopSafetyInfo *SafetyInfo) {
+                              LoopSafetyInfo *SafetyInfo,
+                              OptimizationRemarkEmitter *ORE) {
   // Loads have extra constraints we have to verify before we can hoist them.
   if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
     if (!LI->isUnordered())
@@ -486,7 +501,17 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
     AAMDNodes AAInfo;
     LI->getAAMetadata(AAInfo);
 
-    return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST);
+    bool Invalidated =
+        pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST);
+    // Check loop-invariant address because this may also be a sinkable load
+    // whose address is not necessarily loop-invariant.
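For context, and not part of the patch itself: remarks emitted through OptimizationRemarkEmitter in this file are attributed to the pass name in DEBUG_TYPE ("licm"), so the missed-optimization report added just below would typically be surfaced through the usual remark flags and read roughly like this (source location invented):

    loop.c:42:7: remark: failed to move load with loop-invariant address because the loop may invalidate its value [-Rpass-missed=licm]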
+    if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand()))
+      ORE->emit(OptimizationRemarkMissed(
+                    DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI)
+                << "failed to move load with loop-invariant address "
+                   "because the loop may invalidate its value");
+
+    return !Invalidated;
   } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
     // Don't sink or hoist dbg info; it's legal, but not useful.
     if (isa<DbgInfoIntrinsic>(I))
@@ -680,8 +705,11 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
 ///
 static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
                  const Loop *CurLoop, AliasSetTracker *CurAST,
-                 const LoopSafetyInfo *SafetyInfo) {
+                 const LoopSafetyInfo *SafetyInfo,
+                 OptimizationRemarkEmitter *ORE) {
   DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
+  ORE->emit(OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
+            << "sinking " << ore::NV("Inst", &I));
   bool Changed = false;
   if (isa<LoadInst>(I))
     ++NumMovedLoads;
@@ -748,10 +776,13 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
 /// is safe to hoist, this instruction is called to do the dirty work.
 ///
 static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
-                  const LoopSafetyInfo *SafetyInfo) {
+                  const LoopSafetyInfo *SafetyInfo,
+                  OptimizationRemarkEmitter *ORE) {
   auto *Preheader = CurLoop->getLoopPreheader();
   DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
                << "\n");
+  ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I)
+            << "hoisting " << ore::NV("Inst", &I));
 
   // Metadata can be dependent on conditions we are hoisting above.
   // Conservatively strip all metadata on the instruction unless we were
@@ -786,15 +817,28 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
 /// Only sink or hoist an instruction if it is not a trapping instruction,
 /// or if the instruction is known not to trap when moved to the preheader,
 /// or if it is a trapping instruction and is guaranteed to execute.
-static bool isSafeToExecuteUnconditionally(const Instruction &Inst,
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,
                                            const DominatorTree *DT,
                                            const Loop *CurLoop,
                                            const LoopSafetyInfo *SafetyInfo,
+                                           OptimizationRemarkEmitter *ORE,
                                            const Instruction *CtxI) {
   if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT))
     return true;
 
-  return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
+  bool GuaranteedToExecute =
+      isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
+
+  if (!GuaranteedToExecute) {
+    auto *LI = dyn_cast<LoadInst>(&Inst);
+    if (LI && CurLoop->isLoopInvariant(LI->getPointerOperand()))
+      ORE->emit(OptimizationRemarkMissed(
+                    DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI)
+                << "failed to hoist load with loop-invariant address "
+                   "because load is conditionally executed");
+  }
+
+  return GuaranteedToExecute;
 }
 
 namespace {
@@ -882,7 +926,8 @@ bool llvm::promoteLoopAccessesToScalars(
     AliasSet &AS, SmallVectorImpl<BasicBlock *> &ExitBlocks,
     SmallVectorImpl<Instruction *> &InsertPts, PredIteratorCache &PIC,
     LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
-    Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) {
+    Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+    OptimizationRemarkEmitter *ORE) {
   // Verify inputs.
   assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
          CurAST != nullptr && SafetyInfo != nullptr &&
@@ -982,14 +1027,14 @@ bool llvm::promoteLoopAccessesToScalars(
 
     // If there is a non-load/store instruction in the loop, we can't promote
     // it.
-    if (const LoadInst *Load = dyn_cast<LoadInst>(UI)) {
+    if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
       assert(!Load->isVolatile() && "AST broken");
       if (!Load->isSimple())
         return false;
 
       if (!DereferenceableInPH)
         DereferenceableInPH = isSafeToExecuteUnconditionally(
-            *Load, DT, CurLoop, SafetyInfo, Preheader->getTerminator());
+            *Load, DT, CurLoop, SafetyInfo, ORE, Preheader->getTerminator());
     } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
       // Stores *of* the pointer are not interesting, only stores *to* the
       // pointer.
@@ -1074,6 +1119,9 @@ bool llvm::promoteLoopAccessesToScalars(
   // Otherwise, this is safe to promote, let's do it!
   DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
                << '\n');
+  ORE->emit(
+      OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", LoopUses[0])
+      << "Moving accesses to memory location out of the loop");
   ++NumPromoted;
 
   // Grab a debug location for the inserted loads/stores; given that the
diff --git a/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp b/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
new file mode 100644
index 000000000000..a64c99117d64
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
@@ -0,0 +1,25 @@
+//===- LoopAccessAnalysisPrinter.cpp - Loop Access Analysis Printer --------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-accesses"
+
+PreservedAnalyses
+LoopAccessInfoPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+                               LoopStandardAnalysisResults &AR, LPMUpdater &) {
+  Function &F = *L.getHeader()->getParent();
+  auto &LAI = AM.getResult<LoopAccessAnalysis>(L, AR);
+  OS << "Loop access info in function '" << F.getName() << "':\n";
+  OS.indent(2) << L.getHeader()->getName() << ":\n";
+  LAI.print(OS, 4);
+  return PreservedAnalyses::all();
+}
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 187e6e3073c7..cca75a365024 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -19,9 +19,9 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 using namespace llvm;
 
@@ -215,15 +215,10 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
   return Changed;
 }
 
-PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM) {
-  auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
-  Function *F = L.getHeader()->getParent();
-
-  auto &DT = *FAM.getCachedResult<DominatorTreeAnalysis>(*F);
-  auto &SE = *FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
-  auto &LI = *FAM.getCachedResult<LoopAnalysis>(*F);
-
-  bool Changed = runImpl(&L, DT, SE, LI);
+PreservedAnalyses
+LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
+                      LoopStandardAnalysisResults &AR,
+                      LPMUpdater &) {
+  bool Changed = runImpl(&L, AR.DT, AR.SE, AR.LI);
 
   if (!Changed)
     return PreservedAnalyses::all();
 
diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp
index b2b2f72aa83d..19716b28ad66 100644
--- a/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -31,13 +31,13 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
@@ -946,10 +946,18 @@ PreservedAnalyses LoopDistributePass::run(Function &F,
   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 
+  // We don't directly need these analyses but they're required for loop
+  // analyses so provide them below.
+  auto &AA = AM.getResult<AAManager>(F);
+  auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
       [&](Loop &L) -> const LoopAccessInfo & {
-    return LAM.getResult<LoopAccessAnalysis>(L);
+    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+    return LAM.getResult<LoopAccessAnalysis>(L, AR);
   };
 
   bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 2743574ecca6..5fec51c095d0 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -46,7 +46,6 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -61,6 +60,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
@@ -186,24 +186,12 @@ public:
 };
 } // End anonymous namespace.
 
-PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L,
-                                              LoopAnalysisManager &AM) {
-  const auto &FAM =
-      AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
-  Function *F = L.getHeader()->getParent();
-
-  // Use getCachedResult because Loop pass cannot trigger a function analysis.
-  auto *AA = FAM.getCachedResult<AAManager>(*F);
-  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
-  auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
-  auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
-  auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F);
-  const auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F);
+PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
+                                              LoopStandardAnalysisResults &AR,
+                                              LPMUpdater &) {
   const auto *DL = &L.getHeader()->getModule()->getDataLayout();
-  assert((AA && DT && LI && SE && TLI && TTI && DL) &&
-         "Analyses for Loop Idiom Recognition not available");
 
-  LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL);
+  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL);
   if (!LIR.runOnLoop(&L))
     return PreservedAnalyses::all();
 
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index f6620ad1ade5..69102d10ff60 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -18,7 +18,6 @@
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/DataLayout.h"
@@ -26,6 +25,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 using namespace llvm;
@@ -183,20 +183,10 @@ public:
 };
 }
 
-PreservedAnalyses LoopInstSimplifyPass::run(Loop &L,
-                                            LoopAnalysisManager &AM) {
-  const auto &FAM =
-      AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
-  Function *F = L.getHeader()->getParent();
-
-  // Use getCachedResult because Loop pass cannot trigger a function analysis.
-  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
-  auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
-  auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F);
-  const auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F);
-  assert((LI && AC && TLI) && "Analyses for Loop Inst Simplify not available");
-
-  if (!SimplifyLoopInst(&L, DT, LI, AC, TLI))
+PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
+                                            LoopStandardAnalysisResults &AR,
+                                            LPMUpdater &) {
+  if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
     return PreservedAnalyses::all();
 
   return getLoopPassPreservedAnalyses();
diff --git a/lib/Transforms/Scalar/LoopPassManager.cpp b/lib/Transforms/Scalar/LoopPassManager.cpp
new file mode 100644
index 000000000000..028f4bba8b1d
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -0,0 +1,85 @@
+//===- LoopPassManager.cpp - Loop pass management -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+using namespace llvm;
+
+// Explicit template instantiations and specialization definitions for core
+// template typedefs.
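As a usage illustration (a minimal sketch, not taken from this patch; it assumes the createFunctionToLoopPassAdaptor helper declared alongside these typedefs in LoopPassManager.h), a loop pipeline built on this pass manager is wrapped into a function pass like so:

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Scalar/IndVarSimplify.h"
    #include "llvm/Transforms/Scalar/LICM.h"
    #include "llvm/Transforms/Scalar/LoopPassManager.h"
    using namespace llvm;

    // Build a small loop pipeline; the adaptor supplies the
    // LoopStandardAnalysisResults and LPMUpdater arguments that each loop
    // pass's run() now takes.
    LoopPassManager LPM;
    LPM.addPass(IndVarSimplifyPass());
    LPM.addPass(LICMPass());
    FunctionPassManager FPM;
    FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));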
+namespace llvm {
+template class PassManager<Loop, LoopAnalysisManager,
+                           LoopStandardAnalysisResults &, LPMUpdater &>;
+
+/// Explicitly specialize the pass manager's run method to handle loop nest
+/// structure updates.
+template <>
+PreservedAnalyses
+PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
+            LPMUpdater &>::run(Loop &L, LoopAnalysisManager &AM,
+                               LoopStandardAnalysisResults &AR, LPMUpdater &U) {
+  PreservedAnalyses PA = PreservedAnalyses::all();
+
+  if (DebugLogging)
+    dbgs() << "Starting Loop pass manager run.\n";
+
+  for (auto &Pass : Passes) {
+    if (DebugLogging)
+      dbgs() << "Running pass: " << Pass->name() << " on " << L;
+
+    PreservedAnalyses PassPA = Pass->run(L, AM, AR, U);
+
+    // If the loop was deleted, abort the run and return to the outer walk.
+    if (U.skipCurrentLoop()) {
+      PA.intersect(std::move(PassPA));
+      break;
+    }
+
+    // Update the analysis manager as each pass runs and potentially
+    // invalidates analyses.
+    AM.invalidate(L, PassPA);
+
+    // Finally, we intersect the final preserved analyses to compute the
+    // aggregate preserved set for this pass manager.
+    PA.intersect(std::move(PassPA));
+
+    // FIXME: Historically, the pass managers all called the LLVM context's
+    // yield function here. We don't have a generic way to acquire the
+    // context and it isn't yet clear what the right pattern is for yielding
+    // in the new pass manager so it is currently omitted.
+    // ...getContext().yield();
+  }
+
+  // Invalidation for the current loop should be handled above, and other loop
+  // analysis results shouldn't be impacted by runs over this loop. Therefore,
+  // the remaining analysis results in the AnalysisManager are preserved. We
+  // mark this with a set so that we don't need to inspect each one
+  // individually.
+  // FIXME: This isn't correct! This loop and all nested loops' analyses should
+  // be preserved, but unrolling should invalidate the parent loop's analyses.
+  PA.preserveSet<AllAnalysesOn<Loop>>();
+
+  if (DebugLogging)
+    dbgs() << "Finished Loop pass manager run.\n";
+
+  return PA;
+}
+}
+
+PrintLoopPass::PrintLoopPass() : OS(dbgs()) {}
+PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner)
+    : OS(OS), Banner(Banner) {}
+
+PreservedAnalyses PrintLoopPass::run(Loop &L, LoopAnalysisManager &,
+                                     LoopStandardAnalysisResults &,
+                                     LPMUpdater &) {
+  printLoop(L, OS, Banner);
+  return PreservedAnalyses::all();
+}
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 0225cc325700..cc83069d5f52 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -14,13 +14,12 @@
 #include "llvm/Transforms/Scalar/LoopRotation.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -34,6 +33,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
@@ -625,20 +625,11 @@ bool LoopRotate::processLoop(Loop *L) {
 LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication)
     : EnableHeaderDuplication(EnableHeaderDuplication) {}
 
-PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM) {
-  auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
-  Function *F = L.getHeader()->getParent();
-
-  auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
-  const auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F);
-  auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F);
-  assert((LI && TTI && AC) && "Analyses for loop rotation not available");
-
-  // Optional analyses.
-  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
-  auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
+PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
+                                      LoopStandardAnalysisResults &AR,
+                                      LPMUpdater &) {
   int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0;
-  LoopRotate LR(Threshold, LI, TTI, AC, DT, SE);
+  LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE);
 
   bool Changed = LR.processLoop(&L);
   if (!Changed)
diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index d37339fc5fee..16061212ba38 100644
--- a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -18,18 +18,18 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 using namespace llvm;
@@ -64,16 +64,10 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
   return Changed;
 }
 
-PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM) {
-  const auto &FAM =
-      AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
-  Function *F = L.getHeader()->getParent();
-
-  auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
-  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
-  assert((LI && DT) && "Analyses for LoopSimplifyCFG not available");
-
-  if (!simplifyLoopCFG(L, *DT, *LI))
+PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
+                                           LoopStandardAnalysisResults &AR,
+                                           LPMUpdater &) {
+  if (!simplifyLoopCFG(L, AR.DT, AR.LI))
     return PreservedAnalyses::all();
   return getLoopPassPreservedAnalyses();
 }
diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp
index f64354497771..f3f415275c0e 100644
--- a/lib/Transforms/Scalar/LoopSink.cpp
+++ b/lib/Transforms/Scalar/LoopSink.cpp
@@ -38,7 +38,6 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/IR/Dominators.h"
@@ -47,6 +46,7 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 using namespace llvm;
@@ -283,6 +283,9 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
   // sunk.
   for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) {
     Instruction *I = &*II++;
+    // No need to check whether the instruction's operands are loop invariant.
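The assertion that follows records an invariant rather than checking new behavior: an instruction already sitting in the preheader is defined outside the loop, so its operands are necessarily loop invariant. Written out as an explicit, redundant check (sketch only):

    // Equivalent explicit form of the invariant asserted below.
    for (Value *Op : I->operands())
      assert(L.isLoopInvariant(Op) && "operand defined inside the loop?");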
+    assert(L.hasLoopInvariantOperands(I) &&
+           "Insts in a loop's preheader should have loop invariant operands!");
     if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr))
       continue;
     if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI))
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index a61f646042ae..a1561fc0a6c2 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -59,16 +59,15 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/IVUsers.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -80,13 +79,13 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/OperandTraits.h"
 #include "llvm/IR/Operator.h"
-#include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
@@ -99,6 +98,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
@@ -5052,21 +5052,11 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
   return ReduceLoopStrength(L, IU, SE, DT, LI, TTI);
 }
 
-PreservedAnalyses LoopStrengthReducePass::run(Loop &L,
-                                              LoopAnalysisManager &AM) {
-  const auto &FAM =
-      AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
-  Function *F = L.getHeader()->getParent();
-
-  auto &IU = AM.getResult<IVUsersAnalysis>(L);
-  auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
-  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
-  auto *LI = FAM.getCachedResult<LoopAnalysis>(*F);
-  auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F);
-  assert((SE && DT && LI && TTI) &&
-         "Analyses for Loop Strength Reduce not available");
-
-  if (!ReduceLoopStrength(&L, IU, *SE, *DT, *LI, *TTI))
+PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
+                                              LoopStandardAnalysisResults &AR,
+                                              LPMUpdater &) {
+  if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
+                          AR.DT, AR.LI, AR.TTI))
     return PreservedAnalyses::all();
 
   return getLoopPassPreservedAnalyses();
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index f66369b30369..c7f91226d222 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -19,7 +19,6 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/Analysis/LoopUnrollAnalyzer.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -33,6 +32,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 #include <climits>
@@ -1111,41 +1111,23 @@ Pass *llvm::createSimpleLoopUnrollPass() {
   return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0);
 }
 
-PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM) {
+PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
+                                      LoopStandardAnalysisResults &AR,
+                                      LPMUpdater &) {
   const auto &FAM =
-      AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
+      AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
   Function *F = L.getHeader()->getParent();
-
-  DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
-  LoopInfo *LI = FAM.getCachedResult<LoopAnalysis>(*F);
-  ScalarEvolution *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
-  auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F);
-  auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F);
   auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
-  if (!DT)
-    report_fatal_error(
-        "LoopUnrollPass: DominatorTreeAnalysis not cached at a higher level");
-  if (!LI)
-    report_fatal_error(
-        "LoopUnrollPass: LoopAnalysis not cached at a higher level");
-  if (!SE)
-    report_fatal_error(
-        "LoopUnrollPass: ScalarEvolutionAnalysis not cached at a higher level");
-  if (!TTI)
-    report_fatal_error(
-        "LoopUnrollPass: TargetIRAnalysis not cached at a higher level");
-  if (!AC)
-    report_fatal_error(
-        "LoopUnrollPass: AssumptionAnalysis not cached at a higher level");
+  // FIXME: This should probably be optional rather than required.
   if (!ORE)
     report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "
                        "cached at a higher level");
-  bool Changed =
-      tryToUnrollLoop(&L, *DT, LI, SE, *TTI, *AC, *ORE, /*PreserveLCSSA*/ true,
-                      ProvidedCount, ProvidedThreshold, ProvidedAllowPartial,
-                      ProvidedRuntime, ProvidedUpperBound);
+  bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE,
+                                 /*PreserveLCSSA*/ true, ProvidedCount,
+                                 ProvidedThreshold, ProvidedAllowPartial,
+                                 ProvidedRuntime, ProvidedUpperBound);
 
   if (!Changed)
     return PreservedAnalyses::all();
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index eef7db08cd46..e1b6741f31b4 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -135,6 +135,10 @@ struct CongruenceClass {
   // purposes, and for skipping empty classes.
   bool Dead = false;
 
+  // Number of stores in this congruence class.
+  // This is used so we can detect store equivalence changes properly.
+  int StoreCount = 0;
+
   explicit CongruenceClass(unsigned ID) : ID(ID) {}
   CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
       : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
@@ -198,7 +202,7 @@ class NewGVN : public FunctionPass {
   ExpressionClassMap ExpressionToClass;
 
   // Which values have changed as a result of leader changes.
-  SmallPtrSet<Value *, 8> ChangedValues;
+  SmallPtrSet<Value *, 8> LeaderChanges;
 
   // Reachability info.
   using BlockEdge = BasicBlockEdge;
@@ -317,7 +321,8 @@ private:
   template <class T>
   Value *lookupOperandLeader(Value *, const User *, const T &) const;
   void performCongruenceFinding(Value *, const Expression *);
-
+  void moveValueToNewCongruenceClass(Value *, CongruenceClass *,
+                                     CongruenceClass *);
   // Reachability handling.
   void updateReachableEdge(BasicBlock *, BasicBlock *);
   void processOutgoingEdges(TerminatorInst *, BasicBlock *);
@@ -347,7 +352,8 @@ private:
   void cleanupTables();
   std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
   void updateProcessedCount(Value *V);
-  void verifyMemoryCongruency();
+  void verifyMemoryCongruency() const;
+  bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
 };
 
 char NewGVN::ID = 0;
@@ -717,10 +723,10 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
 // Utility function to check whether the congruence class has a member other
 // than the given instruction.
 bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) {
-  // Either it has more than one member, in which case it must contain something
-  // other than us (because it's indexed by value), or if it only has one member
+  // Either it has more than one store, in which case it must contain something
+  // other than us (because it's indexed by value), or if it only has one store
   // right now, that member should not be us.
-  return CC->Members.size() > 1 || CC->Members.count(I) == 0;
+  return CC->StoreCount > 1 || CC->Members.count(I) == 0;
 }
 
 const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
@@ -1044,7 +1050,40 @@ void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) {
   for (auto M : CC->Members) {
     if (auto *I = dyn_cast<Instruction>(M))
       TouchedInstructions.set(InstrDFS[I]);
-    ChangedValues.insert(M);
+    LeaderChanges.insert(M);
+  }
+}
+
+// Move a value, currently in OldClass, to be part of NewClass.
+// Update OldClass for the move (including changing leaders, etc.).
+void NewGVN::moveValueToNewCongruenceClass(Value *V, CongruenceClass *OldClass,
+                                           CongruenceClass *NewClass) {
+  DEBUG(dbgs() << "New congruence class for " << V << " is " << NewClass->ID
+               << "\n");
+  OldClass->Members.erase(V);
+  NewClass->Members.insert(V);
+  if (isa<StoreInst>(V)) {
+    --OldClass->StoreCount;
+    assert(OldClass->StoreCount >= 0);
+    ++NewClass->StoreCount;
+    assert(NewClass->StoreCount > 0);
+  }
+
+  ValueToClass[V] = NewClass;
+  // See if we destroyed the class or need to swap leaders.
+  if (OldClass->Members.empty() && OldClass != InitialClass) {
+    if (OldClass->DefiningExpr) {
+      OldClass->Dead = true;
+      DEBUG(dbgs() << "Erasing expression " << OldClass->DefiningExpr
+                   << " from table\n");
+      ExpressionToClass.erase(OldClass->DefiningExpr);
+    }
+  } else if (OldClass->RepLeader == V) {
+    // When the leader changes, the value numbering of
+    // everything may change due to symbolization changes, so we need to
+    // reprocess.
+    OldClass->RepLeader = *(OldClass->Members.begin());
+    markLeaderChangeTouched(OldClass);
   }
 }
 
@@ -1101,33 +1140,16 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
       assert(!EClass->Dead && "We accidentally looked up a dead class");
     }
   }
-  bool WasInChanged = ChangedValues.erase(V);
-  if (VClass != EClass || WasInChanged) {
+  bool ClassChanged = VClass != EClass;
+  bool LeaderChanged = LeaderChanges.erase(V);
+  if (ClassChanged || LeaderChanged) {
     DEBUG(dbgs() << "Found class " << EClass->ID << " for expression " << E
                  << "\n");
 
-    if (VClass != EClass) {
-      DEBUG(dbgs() << "New congruence class for " << V << " is " << EClass->ID
-                   << "\n");
-
-      VClass->Members.erase(V);
-      EClass->Members.insert(V);
-      ValueToClass[V] = EClass;
-      // See if we destroyed the class or need to swap leaders.
-      if (VClass->Members.empty() && VClass != InitialClass) {
-        if (VClass->DefiningExpr) {
-          VClass->Dead = true;
-          DEBUG(dbgs() << "Erasing expression " << *E << " from table\n");
-          ExpressionToClass.erase(VClass->DefiningExpr);
-        }
-      } else if (VClass->RepLeader == V) {
-        // When the leader changes, the value numbering of
-        // everything may change due to symbolization changes, so we need to
-        // reprocess.
-        VClass->RepLeader = *(VClass->Members.begin());
-        markLeaderChangeTouched(VClass);
-      }
-    }
+    if (ClassChanged)
+      moveValueToNewCongruenceClass(V, VClass, EClass);
 
     markUsersTouched(V);
     if (auto *I = dyn_cast<Instruction>(V)) {
@@ -1315,9 +1337,12 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
       // MemoryDef's for stores and all MemoryPhis to be equal. Right now, no
       // other expression can generate a memory equivalence. If we start
       // handling memcpy/etc, we can expand this.
-      if (isa<StoreInst>(&I))
+      if (isa<StoreInst>(&I)) {
         MemoryAccessEquiv.insert(
             {MSSA->getMemoryAccess(&I), MSSA->getLiveOnEntryDef()});
+        ++InitialClass->StoreCount;
+        assert(InitialClass->StoreCount > 0);
+      }
     }
   }
   InitialClass->Members.swap(InitialValues);
@@ -1454,9 +1479,40 @@ void NewGVN::valueNumberInstruction(Instruction *I) {
   }
 }
 
+// Check if there is a path, using single or equal argument phi nodes, from
+// First to Second.
+bool NewGVN::singleReachablePHIPath(const MemoryAccess *First,
+                                    const MemoryAccess *Second) const {
+  if (First == Second)
+    return true;
+
+  if (auto *FirstDef = dyn_cast<MemoryUseOrDef>(First)) {
+    auto *DefAccess = FirstDef->getDefiningAccess();
+    return singleReachablePHIPath(DefAccess, Second);
+  } else {
+    auto *MP = cast<MemoryPhi>(First);
+    auto ReachableOperandPred = [&](const Use &U) {
+      return ReachableBlocks.count(MP->getIncomingBlock(U));
+    };
+    auto FilteredPhiArgs =
+        make_filter_range(MP->operands(), ReachableOperandPred);
+    SmallVector<const Value *, 32> OperandList;
+    std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+              std::back_inserter(OperandList));
+    bool Okay = OperandList.size() == 1;
+    if (!Okay)
+      Okay = std::equal(OperandList.begin(), OperandList.end(),
+                        OperandList.begin());
+    if (Okay)
+      return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second);
+    return false;
+  }
+}
+
 // Verify that the memory equivalence table makes sense relative to the
-// congruence classes.
-void NewGVN::verifyMemoryCongruency() {
+// congruence classes. Note that this checking is not perfect, and is currently
+// subject to very rare false negatives. It is only useful for testing/debugging.
+void NewGVN::verifyMemoryCongruency() const {
   // Anything equivalent in the memory access table should be in the same
   // congruence class.
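One building block above is worth isolating: singleReachablePHIPath() filters a MemoryPhi's operands down to reachable predecessors with make_filter_range from llvm/ADT/STLExtras.h. The same pattern in a self-contained sketch, away from MemorySSA:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/Support/raw_ostream.h"
    #include <vector>

    // Iterate only the elements satisfying a predicate, without
    // materializing a filtered copy of the container.
    void demo() {
      std::vector<int> Xs = {1, -2, 3, -4};
      auto Positive = llvm::make_filter_range(Xs, [](int X) { return X > 0; });
      for (int X : Positive)
        llvm::outs() << X << " "; // prints "1 3 "
    }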
@@ -1483,11 +1539,12 @@ void NewGVN::verifyMemoryCongruency() {
     if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
       auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second);
       if (FirstMUD && SecondMUD)
-        assert(
-            ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
-                ValueToClass.lookup(SecondMUD->getMemoryInst()) &&
-            "The instructions for these memory operations should have been in "
-            "the same congruence class");
+        assert((singleReachablePHIPath(FirstMUD, SecondMUD) ||
+                ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
+                    ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
+               "The instructions for these memory operations should have "
+               "been in the same congruence class or reachable through "
+               "a single argument phi");
     } else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
 
       // We can only sanely verify that MemoryDefs in the operand list all have
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index fa2235e8439a..49ce0262c97b 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -792,6 +792,7 @@ void StructurizeCFG::handleLoops(bool ExitUseAllowed,
                              LoopFunc,
                              LoopStart);
     BranchInst::Create(LoopStart, NewEntry);
+    DT->setNewRoot(NewEntry);
   }
 
   // Create an extra loop end node
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index e551e4b47ac1..f9a602bc268a 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -172,6 +172,36 @@ static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
   return false;
 }
 
+/// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
+/// and adds a mapping from the original loop to the new loop to NewLoops.
+/// Returns nullptr if no new loop was created; otherwise returns a pointer
+/// to the original loop that OriginalBB was part of.
+const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
+                                           BasicBlock *ClonedBB, LoopInfo *LI,
+                                           NewLoopsMap &NewLoops) {
+  // Figure out which loop New is in.
+  const Loop *OldLoop = LI->getLoopFor(OriginalBB);
+  assert(OldLoop && "Should (at least) be in the loop being unrolled!");
+
+  Loop *&NewLoop = NewLoops[OldLoop];
+  if (!NewLoop) {
+    // Found a new sub-loop.
+    assert(OriginalBB == OldLoop->getHeader() &&
+           "Header should be first in RPO");
+
+    Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
+    assert(NewLoopParent &&
+           "Expected parent loop before sub-loop in RPO");
+    NewLoop = new Loop;
+    NewLoopParent->addChildLoop(NewLoop);
+    NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
+    return OldLoop;
+  } else {
+    NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
+    return nullptr;
+  }
+}
+
 /// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
 /// if unrolling was successful, or false if the loop was unmodified. Unrolling
 /// can only fail when the loop's latch block is not terminated by a conditional
@@ -428,28 +458,14 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
       assert(LI->getLoopFor(*BB) == L && "Header should not be in a sub-loop");
       L->addBasicBlockToLoop(New, *LI);
     } else {
-      // Figure out which loop New is in.
-      const Loop *OldLoop = LI->getLoopFor(*BB);
-      assert(OldLoop && "Should (at least) be in the loop being unrolled!");
-
-      Loop *&NewLoop = NewLoops[OldLoop];
-      if (!NewLoop) {
-        // Found a new sub-loop.
-        assert(*BB == OldLoop->getHeader() &&
-               "Header should be first in RPO");
-
-        Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
-        assert(NewLoopParent &&
-               "Expected parent loop before sub-loop in RPO");
-        NewLoop = new Loop;
-        NewLoopParent->addChildLoop(NewLoop);
-        LoopsToSimplify.insert(NewLoop);
+      const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
+      if (OldLoop) {
+        LoopsToSimplify.insert(NewLoops[OldLoop]);
 
         // Forget the old loop, since its inputs may have changed.
         if (SE)
           SE->forgetLoop(OldLoop);
       }
-      NewLoop->addBasicBlockToLoop(New, *LI);
     }
 
     if (*BB == Header)
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 5758a415f12b..85da3ba899a5 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -301,15 +301,17 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
     LI->addTopLevelLoop(NewLoop);
   }
 
+  NewLoopsMap NewLoops;
+  NewLoops[L] = NewLoop;
   // For each block in the original loop, create a new copy,
   // and update the value map with the newly created values.
   for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
     BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
     NewBlocks.push_back(NewBB);
 
-    if (NewLoop)
-      NewLoop->addBasicBlockToLoop(NewBB, *LI);
-    else if (ParentLoop)
+    if (NewLoop) {
+      addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
+    } else if (ParentLoop)
       ParentLoop->addBasicBlockToLoop(NewBB, *LI);
 
     VMap[*BB] = NewBB;
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index 09e9f1ddc7fe..c8efa9efc7f3 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -869,8 +869,13 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
     return false;
   }
 
-  assert(TheLoop->getHeader() == Phi->getParent() &&
-         "PHI is an AddRec for a different loop?!");
+  if (AR->getLoop() != TheLoop) {
+    // FIXME: We should treat this as a uniform. Unfortunately, we
+    // don't currently know how to handle uniform PHIs.
+    DEBUG(dbgs() << "LV: PHI is a recurrence with respect to an outer loop.\n");
+    return false;
+  }
+
   Value *StartValue =
       Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader());
   const SCEV *Step = AR->getStepRecurrence(*SE);
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 54390e77bb1f..6e30919246c7 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1275,10 +1275,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
                            LLVMContext::MD_mem_parallel_loop_access};
     combineMetadata(I1, I2, KnownIDs);
 
-    // If the debug loc for I1 and I2 are different, as we are combining them
-    // into one instruction, we do not want to select debug loc randomly from
-    // I1 or I2.
-    if (!isa<CallInst>(I1) && I1->getDebugLoc() != I2->getDebugLoc())
+    // I1 and I2 are being combined into a single instruction. Its debug
+    // location is the merged locations of the original instructions.
+    if (!isa<CallInst>(I1))
       I1->setDebugLoc(
           DILocation::getMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()));
 
@@ -1577,7 +1576,7 @@ static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
   // The debug location for the "common" instruction is the merged locations of
   // all the commoned instructions. We start with the original location of the
   // "common" instruction and iteratively merge each location in the loop below.
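In outline, the merge described in that comment is a left fold over the sunk instructions (a sketch only; the real loop below also combines metadata and IR flags as it goes):

    // Start from the "common" instruction's location and merge in the rest.
    const DILocation *Loc = I0->getDebugLoc();
    for (auto *I : Insts)
      Loc = DILocation::getMergedLocation(Loc, I->getDebugLoc());
    I0->setDebugLoc(Loc);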
-  DILocation *Loc = I0->getDebugLoc();
+  const DILocation *Loc = I0->getDebugLoc();
 
   // Update metadata and IR flags, and merge debug locations.
   for (auto *I : Insts)
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 11d54bcf4f89..8eaeb1073a76 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1074,6 +1074,24 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
   if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0
     return ConstantFP::get(CI->getType(), 1.0);
 
+  if (Op2C->isExactlyValue(-0.5) &&
+      hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
+                      LibFunc::sqrtl)) {
+    // If -ffast-math:
+    // pow(x, -0.5) -> 1.0 / sqrt(x)
+    if (CI->hasUnsafeAlgebra()) {
+      IRBuilder<>::FastMathFlagGuard Guard(B);
+      B.setFastMathFlags(CI->getFastMathFlags());
+
+      // Here we cannot lower to an intrinsic because C99 sqrt() and llvm.sqrt
+      // are not guaranteed to have the same semantics.
+      Value *Sqrt = emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B,
+                                         Callee->getAttributes());
+
+      return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Sqrt, "sqrtrecip");
+    }
+  }
+
   if (Op2C->isExactlyValue(0.5) &&
       hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
                       LibFunc::sqrtl) &&
@@ -1121,6 +1139,10 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
       !V.isInteger())
     return nullptr;
 
+  // Propagate fast math flags.
+  IRBuilder<>::FastMathFlagGuard Guard(B);
+  B.setFastMathFlags(CI->getFastMathFlags());
+
   // We will memoize intermediate products of the Addition Chain.
   Value *InnerChain[33] = {nullptr};
   InnerChain[1] = Op1;
@@ -1131,7 +1153,6 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
   bool ignored;
   V.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &ignored);
 
-  // TODO: Should the new instructions propagate the 'fast' flag of the pow()?
   Value *FMul = getPow(InnerChain, V.convertToDouble(), B);
   // For negative exponents simply compute the reciprocal.
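The memoized addition chain mentioned just above ("We will memoize intermediate products of the Addition Chain") is easier to see on plain scalars. A hypothetical standalone sketch of the idea, not the pass's actual helper:

    #include <map>

    // Compute x^n with O(log n) multiplies, caching intermediate powers the
    // way InnerChain[] caches intermediate products in optimizePow().
    double powChain(double X, unsigned N, std::map<unsigned, double> &Memo) {
      if (N == 0)
        return 1.0;
      if (N == 1)
        return X;
      auto It = Memo.find(N);
      if (It != Memo.end())
        return It->second;
      double Half = powChain(X, N / 2, Memo);
      double R = Half * Half;
      if (N & 1)
        R *= X; // odd exponent: one extra multiply
      return Memo[N] = R;
    }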
if (Op2C->isNegative()) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 578c65daf7c0..1b1f86f8efdc 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -80,6 +80,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" @@ -6949,9 +6950,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } else if (Legal->isUniform(Op2)) { Op2VK = TargetTransformInfo::OK_UniformValue; } - - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, - Op1VP, Op2VP); + SmallVector<const Value *, 4> Operands(I->operand_values()); + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, + Op2VK, Op1VP, Op2VP, Operands); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); @@ -7641,7 +7642,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &TTI = AM.getResult<TargetIRAnalysis>(F); auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); - auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F); + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); auto &AA = AM.getResult<AAManager>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); auto &DB = AM.getResult<DemandedBitsAnalysis>(F); @@ -7650,10 +7651,11 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - return LAM.getResult<LoopAccessAnalysis>(L); + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI}; + return LAM.getResult<LoopAccessAnalysis>(L, AR); }; bool Changed = - runImpl(F, SE, LI, TTI, DT, BFI, TLI, DB, AA, AC, GetLAA, ORE); + runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE); if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index bcaa8439cffa..1c7cbc7edf9a 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2493,10 +2493,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); - if (LHS == RHS && isa<Instruction>(LHS)) { - assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); - } - if (Value *V = alreadyVectorized(E->Scalars)) return V; diff --git a/lib/XRay/CMakeLists.txt b/lib/XRay/CMakeLists.txt new file mode 100644 index 000000000000..6c1acba79bfa --- /dev/null +++ b/lib/XRay/CMakeLists.txt @@ -0,0 +1,13 @@ +add_llvm_library(LLVMXRay + Trace.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/ADT + ${LLVM_MAIN_INCLUDE_DIR}/llvm/XRay + + DEPENDS + LLVMSupport + + LINK_LIBS + LLVMSupport + ) diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp new file mode 100644 index 000000000000..51000c777de8 --- /dev/null +++ b/lib/XRay/Trace.cpp @@ -0,0 +1,196 @@ +//===- Trace.cpp - XRay Trace Loading implementation. ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// XRay log reader implementation.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/XRay/Trace.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/XRay/YAMLXRayRecord.h"
+
+using namespace llvm;
+using namespace llvm::xray;
+using llvm::yaml::Input;
+
+using XRayRecordStorage =
+    std::aligned_storage<sizeof(XRayRecord), alignof(XRayRecord)>::type;
+
+Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
+                     std::vector<XRayRecord> &Records) {
+  // FIXME: Maybe deduce whether the data is little or big-endian using some
+  // magic bytes in the beginning of the file?
+
+  // First 32 bytes of the file will always be the header. We assume a certain
+  // format here:
+  //
+  //   (2)   uint16 : version
+  //   (2)   uint16 : type
+  //   (4)   uint32 : bitfield
+  //   (8)   uint64 : cycle frequency
+  //   (16)  -      : padding
+  //
+  if (Data.size() < 32)
+    return make_error<StringError>(
+        "Not enough bytes for an XRay log.",
+        std::make_error_code(std::errc::invalid_argument));
+
+  if (Data.size() - 32 == 0 || Data.size() % 32 != 0)
+    return make_error<StringError>(
+        "Invalid-sized XRay data.",
+        std::make_error_code(std::errc::invalid_argument));
+
+  DataExtractor HeaderExtractor(Data, true, 8);
+  uint32_t OffsetPtr = 0;
+  FileHeader.Version = HeaderExtractor.getU16(&OffsetPtr);
+  FileHeader.Type = HeaderExtractor.getU16(&OffsetPtr);
+  uint32_t Bitfield = HeaderExtractor.getU32(&OffsetPtr);
+  FileHeader.ConstantTSC = Bitfield & 1uL;
+  FileHeader.NonstopTSC = Bitfield & 1uL << 1;
+  FileHeader.CycleFrequency = HeaderExtractor.getU64(&OffsetPtr);
+
+  if (FileHeader.Version != 1)
+    return make_error<StringError>(
+        Twine("Unsupported XRay file version: ") + Twine(FileHeader.Version),
+        std::make_error_code(std::errc::invalid_argument));
+
+  // Each record after the header will be 32 bytes, in the following format:
+  //
+  //   (2)   uint16 : record type
+  //   (1)   uint8  : cpu id
+  //   (1)   uint8  : type
+  //   (4)   sint32 : function id
+  //   (8)   uint64 : tsc
+  //   (4)   uint32 : thread id
+  //   (12)  -      : padding
+  for (auto S = Data.drop_front(32); !S.empty(); S = S.drop_front(32)) {
+    DataExtractor RecordExtractor(S, true, 8);
+    uint32_t OffsetPtr = 0;
+    Records.emplace_back();
+    auto &Record = Records.back();
+    Record.RecordType = RecordExtractor.getU16(&OffsetPtr);
+    Record.CPU = RecordExtractor.getU8(&OffsetPtr);
+    auto Type = RecordExtractor.getU8(&OffsetPtr);
+    switch (Type) {
+    case 0:
+      Record.Type = RecordTypes::ENTER;
+      break;
+    case 1:
+      Record.Type = RecordTypes::EXIT;
+      break;
+    default:
+      return make_error<StringError>(
+          Twine("Unknown record type '") + Twine(int{Type}) + "'",
+          std::make_error_code(std::errc::executable_format_error));
+    }
+    Record.FuncId = RecordExtractor.getSigned(&OffsetPtr, sizeof(int32_t));
+    Record.TSC = RecordExtractor.getU64(&OffsetPtr);
+    Record.TId = RecordExtractor.getU32(&OffsetPtr);
+  }
+  return Error::success();
+}
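The two fixed 32-byte layouts documented in NaiveLogLoader above are easy to mirror with plain structs. The sketch below illustrates the on-disk format only; it is not LLVM's reader, and it assumes a little-endian host so that a raw memcpy matches the little-endian DataExtractor reads above.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Mirrors the 32-byte header documented in NaiveLogLoader:
//   (2) uint16 version, (2) uint16 type, (4) uint32 bitfield,
//   (8) uint64 cycle frequency, (16) padding.
struct RawXRayHeader {
  uint16_t Version;
  uint16_t Type;
  uint32_t Bitfield;        // bit 0: constant TSC, bit 1: nonstop TSC
  uint64_t CycleFrequency;
  char Padding[16];
};
static_assert(sizeof(RawXRayHeader) == 32, "header must be 32 bytes");

// Mirrors the 32-byte record layout:
//   (2) uint16 record type, (1) uint8 cpu, (1) uint8 entry/exit,
//   (4) int32 function id, (8) uint64 tsc, (4) uint32 thread id, (12) padding.
struct RawXRayRecord {
  uint16_t RecordType;
  uint8_t CPU;
  uint8_t Type;             // 0 = function entry, 1 = function exit
  int32_t FuncId;
  uint64_t TSC;
  uint32_t TId;
  char Padding[12];
};
static_assert(sizeof(RawXRayRecord) == 32, "record must be 32 bytes");

// Assumes Data points at a complete little-endian log (header + N records).
inline bool parseRawHeader(const char *Data, size_t Size, RawXRayHeader &H) {
  if (Size < 32 || Size % 32 != 0)   // same size checks as the loader above
    return false;
  std::memcpy(&H, Data, sizeof(H));  // safe: the struct has no padding holes
  return H.Version == 1;             // only version 1 is understood here
}

Because neither struct has internal padding, every field lands at the same offset the comment block describes, which makes the format easy to eyeball in a hex dump.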
+
+Error YAMLLogLoader(StringRef Data, XRayFileHeader &FileHeader,
+                    std::vector<XRayRecord> &Records) {
+
+  // Load the documents from the MappedFile.
+  YAMLXRayTrace Trace;
+  Input In(Data);
+  In >> Trace;
+  if (In.error())
+    return make_error<StringError>("Failed loading YAML Data.", In.error());
+
+  FileHeader.Version = Trace.Header.Version;
+  FileHeader.Type = Trace.Header.Type;
+  FileHeader.ConstantTSC = Trace.Header.ConstantTSC;
+  FileHeader.NonstopTSC = Trace.Header.NonstopTSC;
+  FileHeader.CycleFrequency = Trace.Header.CycleFrequency;
+
+  if (FileHeader.Version != 1)
+    return make_error<StringError>(
+        Twine("Unsupported XRay file version: ") + Twine(FileHeader.Version),
+        std::make_error_code(std::errc::invalid_argument));
+
+  Records.clear();
+  std::transform(Trace.Records.begin(), Trace.Records.end(),
+                 std::back_inserter(Records), [&](const YAMLXRayRecord &R) {
+                   return XRayRecord{R.RecordType, R.CPU, R.Type,
+                                     R.FuncId,     R.TSC, R.TId};
+                 });
+  return Error::success();
+}
+
+Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) {
+  int Fd;
+  if (auto EC = sys::fs::openFileForRead(Filename, Fd)) {
+    return make_error<StringError>(
+        Twine("Cannot read log from '") + Filename + "'", EC);
+  }
+
+  // Attempt to get the filesize.
+  uint64_t FileSize;
+  if (auto EC = sys::fs::file_size(Filename, FileSize)) {
+    return make_error<StringError>(
+        Twine("Cannot read log from '") + Filename + "'", EC);
+  }
+  if (FileSize < 4) {
+    return make_error<StringError>(
+        Twine("File '") + Filename + "' too small for XRay.",
+        std::make_error_code(std::errc::executable_format_error));
+  }
+
+  // Attempt to mmap the file.
+  std::error_code EC;
+  sys::fs::mapped_file_region MappedFile(
+      Fd, sys::fs::mapped_file_region::mapmode::readonly, FileSize, 0, EC);
+  if (EC) {
+    return make_error<StringError>(
+        Twine("Cannot read log from '") + Filename + "'", EC);
+  }
+
+  // Attempt to detect the file type using file magic. We have a slight bias
+  // towards the binary format, and we do this by making sure that the first 4
+  // bytes of the binary file are some combination of the following byte
+  // patterns:
+  //
+  //   0x0001 0x0000 - version 1, "naive" format
+  //   0x0001 0x0001 - version 1, "flight data recorder" format
+  //
+  // YAML files don't typically have those first four bytes as valid text, so
+  // we try loading assuming YAML if we don't find these bytes.
+  //
+  // Only if we can't load either the binary or the YAML format will we yield
+  // an error.
+  StringRef Magic(MappedFile.data(), 4);
+  DataExtractor HeaderExtractor(Magic, true, 8);
+  uint32_t OffsetPtr = 0;
+  uint16_t Version = HeaderExtractor.getU16(&OffsetPtr);
+  uint16_t Type = HeaderExtractor.getU16(&OffsetPtr);
+
+  Trace T;
+  if (Version == 1 && (Type == 0 || Type == 1)) {
+    if (auto E = NaiveLogLoader(StringRef(MappedFile.data(), MappedFile.size()),
+                                T.FileHeader, T.Records))
+      return std::move(E);
+  } else {
+    if (auto E = YAMLLogLoader(StringRef(MappedFile.data(), MappedFile.size()),
+                               T.FileHeader, T.Records))
+      return std::move(E);
+  }
+
+  if (Sort)
+    std::sort(T.Records.begin(), T.Records.end(),
+              [&](const XRayRecord &L, const XRayRecord &R) {
+                return L.TSC < R.TSC;
+              });
+
+  return std::move(T);
+}
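The format sniff in loadTraceFile above can be restated in a few lines. This is a hedged sketch with invented names, again assuming a little-endian host; note that in this revision both magic patterns funnel into NaiveLogLoader, and everything else is attempted as YAML:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

enum class TraceFormat { Binary, YAML };

// Hypothetical restatement of loadTraceFile's magic check: version 1 with
// type 0 ("naive") or type 1 ("flight data recorder") selects the binary
// path; any other leading bytes are assumed to be YAML text.
inline TraceFormat sniffTraceFormat(const char *Data, size_t Size) {
  assert(Size >= 4 && "caller rejects files smaller than 4 bytes");
  uint16_t Version, Type;
  std::memcpy(&Version, Data, 2);  // little-endian host assumed
  std::memcpy(&Type, Data + 2, 2);
  if (Version == 1 && (Type == 0 || Type == 1))
    return TraceFormat::Binary;
  return TraceFormat::YAML;        // printable YAML text never matches the magic
}

The bias toward the binary format is cheap to resolve: four bytes decide the loader, and a malformed file still fails cleanly inside whichever loader runs.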
diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index bf802a3b4ea8..f5d16952b406 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -61,6 +61,10 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR})
   # This variable makes sure that e.g. llvm-lit is found.
   set(LLVM_MAIN_SRC_DIR ${LLVM_BUILD_MAIN_SRC_DIR})
 
+  # Handle common options used by all runtimes.
+  include(AddLLVM)
+  include(HandleLLVMOptions)
+
   foreach(entry ${runtimes})
     get_filename_component(projName ${entry} NAME)
 
diff --git a/test/Analysis/CostModel/AArch64/store.ll b/test/Analysis/CostModel/AArch64/store.ll
index 58750721cb97..085863554f00 100644
--- a/test/Analysis/CostModel/AArch64/store.ll
+++ b/test/Analysis/CostModel/AArch64/store.ll
@@ -1,17 +1,59 @@
-; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-apple-ios | FileCheck %s
-; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-apple-ios -mattr=slow-misaligned-128store | FileCheck %s --check-prefix=SLOW_MISALIGNED_128_STORE
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-unknown | FileCheck %s
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-unknown -mattr=slow-misaligned-128store | FileCheck %s --check-prefix=SLOW_MISALIGNED_128_STORE
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 
 ; CHECK-LABEL: getMemoryOpCost
 ; SLOW_MISALIGNED_128_STORE-LABEL: getMemoryOpCost
 define void @getMemoryOpCost() {
-  ; If FeatureSlowMisaligned128Store is set, we penalize <2 x i64> stores. On
-  ; Cyclone, for example, such stores should be expensive because we don't
-  ; split them and misaligned 16b stores have bad performance.
-  ;
-  ; CHECK: cost of 1 {{.*}} store
-  ; SLOW_MISALIGNED_128_STORE: cost of 12 {{.*}} store
+  ; If FeatureSlowMisaligned128Store is set, we penalize 128-bit stores.
+  ; The unlegalized 256-bit stores are further penalized when legalized down
+  ; to 128-bit stores.
+
+  ; CHECK: cost of 2 for {{.*}} store <4 x i64>
+  ; SLOW_MISALIGNED_128_STORE: cost of 24 for {{.*}} store <4 x i64>
+  store <4 x i64> undef, <4 x i64> * undef
+  ; CHECK-NEXT: cost of 2 for {{.*}} store <8 x i32>
+  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <8 x i32>
+  store <8 x i32> undef, <8 x i32> * undef
+  ; CHECK-NEXT: cost of 2 for {{.*}} store <16 x i16>
+  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <16 x i16>
+  store <16 x i16> undef, <16 x i16> * undef
+  ; CHECK-NEXT: cost of 2 for {{.*}} store <32 x i8>
+  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <32 x i8>
+  store <32 x i8> undef, <32 x i8> * undef
+
+  ; CHECK-NEXT: cost of 2 for {{.*}} store <4 x double>
+  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <4 x double>
+  store <4 x double> undef, <4 x double> * undef
+  ; CHECK-NEXT: cost of 2 for {{.*}} store <8 x float>
+  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <8 x float>
+  store <8 x float> undef, <8 x float> * undef
+  ; CHECK-NEXT: cost of 2 for {{.*}} store <16 x half>
+  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <16 x half>
+  store <16 x half> undef, <16 x half> * undef
+
+  ; CHECK-NEXT: cost of 1 for {{.*}} store <2 x i64>
+  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <2 x i64>
   store <2 x i64> undef, <2 x i64> * undef
+  ; CHECK-NEXT: cost of 1 for {{.*}} store <4 x i32>
+  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <4 x i32>
+  store <4 x i32> undef, <4 x i32> * undef
+  ; CHECK-NEXT: cost of 1 for {{.*}} store <8 x i16>
+  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <8 x i16>
+  store <8 x i16> undef, <8 x i16> * undef
+  ; CHECK-NEXT: cost of 1 for {{.*}} store <16 x i8>
+  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <16 x i8>
+  store <16 x i8> undef, <16 x i8> * undef
+
+  ; CHECK-NEXT: cost of 1 for {{.*}} store <2 x double>
+  ;
SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <2 x double> + store <2 x double> undef, <2 x double> * undef + ; CHECK-NEXT: cost of 1 for {{.*}} store <4 x float> + ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <4 x float> + store <4 x float> undef, <4 x float> * undef + ; CHECK-NEXT: cost of 1 for {{.*}} store <8 x half> + ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <8 x half> + store <8 x half> undef, <8 x half> * undef ; We scalarize the loads/stores because there is no vector register name for ; these types (they get extended to v.4h/v.2s). diff --git a/test/Analysis/CostModel/X86/slm-arith-costs.ll b/test/Analysis/CostModel/X86/slm-arith-costs.ll new file mode 100644 index 000000000000..3673a5d9e067 --- /dev/null +++ b/test/Analysis/CostModel/X86/slm-arith-costs.ll @@ -0,0 +1,317 @@ +; RUN: opt < %s -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefix=SLM + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; 8bit mul +define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i8 + %res = mul nsw i8 %a, %b + ret i8 %res +} + +define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i8> + %res = mul nsw <2 x i8> %a, %b + ret <2 x i8> %res +} + +define <4 x i8> @slm-costs_8_v4_mul(<4 x i8> %a, <4 x i8> %b) { +entry: +; SLM: cost of 3 {{.*}} mul nsw <4 x i8> + %res = mul nsw <4 x i8> %a, %b + ret <4 x i8> %res +} + +define <4 x i32> @slm-costs_8_v4_zext_mul(<4 x i8> %a) { +entry: +; SLM: cost of 3 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, <i32 255, i32 255, i32 255, i32 255> + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_zext_mul_fail(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, <i32 255, i32 255, i32 -1, i32 255> + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_zext_mul_fail_2(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, <i32 255, i32 256, i32 255, i32 255> + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_sext_mul(<4 x i8> %a) { +entry: +; SLM: cost of 3 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, <i32 127, i32 -128, i32 127, i32 -128> + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_sext_mul_fail(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, <i32 127, i32 -128, i32 128, i32 -128> + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_8_v4_sext_mul_fail_2(<4 x i8> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i8> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, <i32 127, i32 -129, i32 127, i32 -128> + ret <4 x i32> %res +} + +define <8 x i8> @slm-costs_8_v8_mul(<8 x i8> %a, <8 x i8> %b) { +entry: +; SLM: cost of 2 {{.*}} mul nsw <8 x i8> + %res = mul nsw <8 x i8> %a, %b + ret <8 x i8> %res +} + +define <16 x i8> @slm-costs_8_v16_mul(<16 x i8> %a, <16 x i8> %b) { +entry: +; SLM: cost of 14 {{.*}} mul nsw <16 x i8> + %res = mul nsw <16 x i8> %a, %b + ret <16 x i8> %res +} + +; 16bit mul +define i16 @slm-costs_16_scalar_mul(i16 %a, i16 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i16 + %res = mul nsw i16 %a, %b + ret i16 %res 
+} + +define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i16> + %res = mul nsw <2 x i16> %a, %b + ret <2 x i16> %res +} + +define <4 x i16> @slm-costs_16_v4_mul(<4 x i16> %a, <4 x i16> %b) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i16> + %res = mul nsw <4 x i16> %a, %b + ret <4 x i16> %res +} + +define <4 x i32> @slm-costs_16_v4_zext_mul(<4 x i16> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, <i32 65535, i32 65535, i32 65535, i32 65535> + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_zext_mul_fail(<4 x i16> %a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, <i32 -1, i32 65535, i32 65535, i32 65535> + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_zext_mul_fail_2(<4 x i16> %a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %zext = zext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %zext, <i32 65536, i32 65535, i32 65535, i32 65535> + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_sext_mul(<4 x i16> %a) { +entry: +; SLM: cost of 5 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, <i32 32767, i32 -32768, i32 32767, i32 -32768> + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_sext_mul_fail(<4 x i16> %a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, <i32 32767, i32 -32768, i32 32768, i32 -32768> + ret <4 x i32> %res +} + +define <4 x i32> @slm-costs_16_v4_sext_mul_fail_2(<4 x i16> %a) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %sext = sext <4 x i16> %a to <4 x i32> + %res = mul nsw <4 x i32> %sext, <i32 32767, i32 -32768, i32 32767, i32 -32769> + ret <4 x i32> %res +} + +define <8 x i16> @slm-costs_16_v8_mul(<8 x i16> %a, <8 x i16> %b) { +entry: +; SLM: cost of 2 {{.*}} mul nsw <8 x i16> + %res = mul nsw <8 x i16> %a, %b + ret <8 x i16> %res +} + +define <16 x i16> @slm-costs_16_v16_mul(<16 x i16> %a, <16 x i16> %b) { +entry: +; SLM: cost of 4 {{.*}} mul nsw <16 x i16> + %res = mul nsw <16 x i16> %a, %b + ret <16 x i16> %res +} + +; 32bit mul +define i32 @slm-costs_32_scalar_mul(i32 %a, i32 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i32 + %res = mul nsw i32 %a, %b + ret i32 %res +} + +define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i32> + %res = mul nsw <2 x i32> %a, %b + ret <2 x i32> %res +} + +define <4 x i32> @slm-costs_32_v4_mul(<4 x i32> %a, <4 x i32> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <4 x i32> + %res = mul nsw <4 x i32> %a, %b + ret <4 x i32> %res +} + +define <8 x i32> @slm-costs_32_v8_mul(<8 x i32> %a, <8 x i32> %b) { +entry: +; SLM: cost of 22 {{.*}} mul nsw <8 x i32> + %res = mul nsw <8 x i32> %a, %b + ret <8 x i32> %res +} + +define <16 x i32> @slm-costs_32_v16_mul(<16 x i32> %a, <16 x i32> %b) { +entry: +; SLM: cost of 44 {{.*}} mul nsw <16 x i32> + %res = mul nsw <16 x i32> %a, %b + ret <16 x i32> %res +} + +; 64bit mul +define i64 @slm-costs_64_scalar_mul(i64 %a, i64 %b) { +entry: +; SLM: cost of 1 {{.*}} mul nsw i64 + %res = mul nsw i64 %a, %b + ret i64 %res +} + +define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b) { +entry: +; SLM: cost of 11 {{.*}} mul nsw <2 x i64> + %res = mul nsw <2 x i64> %a, %b + ret <2 x i64> %res +} + +define <4 x 
i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b) { +entry: +; SLM: cost of 22 {{.*}} mul nsw <4 x i64> + %res = mul nsw <4 x i64> %a, %b + ret <4 x i64> %res +} + +define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x i64> %b) { +entry: +; SLM: cost of 44 {{.*}} mul nsw <8 x i64> + %res = mul nsw <8 x i64> %a, %b + ret <8 x i64> %res +} + +define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b) { +entry: +; SLM: cost of 88 {{.*}} mul nsw <16 x i64> + %res = mul nsw <16 x i64> %a, %b + ret <16 x i64> %res +} + +; mulsd +define double @slm-costs_mulsd(double %a, double %b) { +entry: +; SLM: cost of 2 {{.*}} fmul double + %res = fmul double %a, %b + ret double %res +} + +; mulpd +define <2 x double> @slm-costs_mulpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 4 {{.*}} fmul <2 x double> + %res = fmul <2 x double> %a, %b + ret <2 x double> %res +} + +; mulps +define <4 x float> @slm-costs_mulps(<4 x float> %a, <4 x float> %b) { +entry: +; SLM: cost of 2 {{.*}} fmul <4 x float> + %res = fmul <4 x float> %a, %b + ret <4 x float> %res +} + +; divss +define float @slm-costs_divss(float %a, float %b) { +entry: +; SLM: cost of 17 {{.*}} fdiv float + %res = fdiv float %a, %b + ret float %res +} + +; divps +define <4 x float> @slm-costs_divps(<4 x float> %a, <4 x float> %b) { +entry: +; SLM: cost of 39 {{.*}} fdiv <4 x float> + %res = fdiv <4 x float> %a, %b + ret <4 x float> %res +} + +; divsd +define double @slm-costs_divsd(double %a, double %b) { +entry: +; SLM: cost of 32 {{.*}} fdiv double + %res = fdiv double %a, %b + ret double %res +} + +; divpd +define <2 x double> @slm-costs_divpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 69 {{.*}} fdiv <2 x double> + %res = fdiv <2 x double> %a, %b + ret <2 x double> %res +} + +; addpd +define <2 x double> @slm-costs_addpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 2 {{.*}} fadd <2 x double> + %res = fadd <2 x double> %a, %b + ret <2 x double> %res +} + +; subpd +define <2 x double> @slm-costs_subpd(<2 x double> %a, <2 x double> %b) { +entry: +; SLM: cost of 2 {{.*}} fsub <2 x double> + %res = fsub <2 x double> %a, %b + ret <2 x double> %res +} + diff --git a/test/Analysis/CostModel/X86/strided-load-i16.ll b/test/Analysis/CostModel/X86/strided-load-i16.ll index 2c2cf3938bcb..ef786e750950 100755 --- a/test/Analysis/CostModel/X86/strided-load-i16.ll +++ b/test/Analysis/CostModel/X86/strided-load-i16.ll @@ -1,113 +1,113 @@ -; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@A = global [10240 x i16] zeroinitializer, align 16
-@B = global [10240 x i16] zeroinitializer, align 16
-
-; Function Attrs: nounwind uwtable
-define void @load_i16_stride2() {
-;CHECK-LABEL: load_i16_stride2
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 32 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 1
- %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
- %1 = load i16, i16* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
- store i16 %1, i16* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i16_stride3() {
-;CHECK-LABEL: load_i16_stride3
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 32 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
- %1 = load i16, i16* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
- store i16 %1, i16* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i16_stride4() {
-;CHECK-LABEL: load_i16_stride4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 5 for VF 32 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 2
- %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
- %1 = load i16, i16* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
- store i16 %1, i16* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i16_stride5() {
-;CHECK-LABEL: load_i16_stride5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 6 for VF 32 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 5
- %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
- %1 = load i16, i16* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
- store i16 %1, i16* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
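The strided-load functions above (and the `+` block that follows, which appears to re-add them unchanged apart from line endings) check the interleaved-load cost model on SKX. One rough way to read the numbers, offered as an assumption about the model rather than its actual code: a VF-wide i16 load at stride S touches VF * S * 2 bytes, and the estimated cost roughly tracks how many 64-byte (512-bit) vector loads cover that span, plus some extra shuffle cost at the widest factors. A quick back-of-the-envelope check in C++:

#include <cstdio>

// Estimate for the stride-2 i16 kernel checked above:
// cost ~= ceil(VF * Stride * ElemBytes / 64) on a 512-bit target.
int main() {
  const unsigned Stride = 2, ElemBytes = 2;    // i16, stride-2 kernel
  for (unsigned VF = 1; VF <= 32; VF *= 2) {
    unsigned Bytes = VF * Stride * ElemBytes;
    unsigned Loads = (Bytes + 63) / 64;        // 64-byte zmm loads touched
    std::printf("VF %2u: ~%u load(s)\n", VF, Loads);
  }
  // Prints 1, 1, 1, 1, 1, 2 for VF = 1..32, matching the CHECK lines of
  // load_i16_stride2. Wider strides (e.g. stride 4 at VF 32, cost 5) come out
  // above this lower bound because de-interleaving shuffles also cost something.
  return 0;
}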
+; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i16] zeroinitializer, align 16 +@B = global [10240 x i16] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i16_stride2() { +;CHECK-LABEL: load_i16_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i16_stride3() { +;CHECK-LABEL: load_i16_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i16_stride4() { +;CHECK-LABEL: load_i16_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 5 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 2 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, 
i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i16_stride5() { +;CHECK-LABEL: load_i16_stride5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 6 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Analysis/CostModel/X86/strided-load-i32.ll b/test/Analysis/CostModel/X86/strided-load-i32.ll index 0dcd3929da7f..fad9def2bf78 100755 --- a/test/Analysis/CostModel/X86/strided-load-i32.ll +++ b/test/Analysis/CostModel/X86/strided-load-i32.ll @@ -1,110 +1,110 @@ -; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@A = global [10240 x i32] zeroinitializer, align 16
-@B = global [10240 x i32] zeroinitializer, align 16
-
-; Function Attrs: nounwind uwtable
-define void @load_int_stride2() {
-;CHECK-LABEL: load_int_stride2
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 1
- %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
- %1 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
- store i32 %1, i32* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_int_stride3() {
-;CHECK-LABEL: load_int_stride3
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
- %1 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
- store i32 %1, i32* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_int_stride4() {
-;CHECK-LABEL: load_int_stride4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 5 for VF 16 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 2
- %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
- %1 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
- store i32 %1, i32* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_int_stride5() {
-;CHECK-LABEL: load_int_stride5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 6 for VF 16 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 5
- %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
- %1 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
- store i32 %1, i32* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
+; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i32] zeroinitializer, align 16 +@B = global [10240 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_int_stride2() { +;CHECK-LABEL: load_int_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_int_stride3() { +;CHECK-LABEL: load_int_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_int_stride4() { +;CHECK-LABEL: load_int_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 5 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 2 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + 
+for.end: ; preds = %for.body + ret void +} + +define void @load_int_stride5() { +;CHECK-LABEL: load_int_stride5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 6 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + diff --git a/test/Analysis/CostModel/X86/strided-load-i64.ll b/test/Analysis/CostModel/X86/strided-load-i64.ll index 0370b6f80efd..a7f593017d60 100755 --- a/test/Analysis/CostModel/X86/strided-load-i64.ll +++ b/test/Analysis/CostModel/X86/strided-load-i64.ll @@ -1,81 +1,81 @@ -; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@A = global [10240 x i64] zeroinitializer, align 16
-@B = global [10240 x i64] zeroinitializer, align 16
-
-; Function Attrs: nounwind uwtable
-define void @load_i64_stride2() {
-;CHECK-LABEL: load_i64_stride2
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 1
- %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0
- %1 = load i64, i64* %arrayidx, align 16
- %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
- store i64 %1, i64* %arrayidx2, align 8
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i64_stride3() {
-;CHECK-LABEL: load_i64_stride3
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0
- %1 = load i64, i64* %arrayidx, align 16
- %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
- store i64 %1, i64* %arrayidx2, align 8
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i64_stride4() {
-;CHECK-LABEL: load_i64_stride4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 5 for VF 8 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 4
- %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0
- %1 = load i64, i64* %arrayidx, align 16
- %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
- store i64 %1, i64* %arrayidx2, align 8
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
+; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i64] zeroinitializer, align 16 +@B = global [10240 x i64] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i64_stride2() { +;CHECK-LABEL: load_i64_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0 + %1 = load i64, i64* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %1, i64* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i64_stride3() { +;CHECK-LABEL: load_i64_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0 + %1 = load i64, i64* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %1, i64* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i64_stride4() { +;CHECK-LABEL: load_i64_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 5 for VF 8 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 4 + %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0 + %1 = load i64, i64* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %1, i64* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Analysis/CostModel/X86/strided-load-i8.ll b/test/Analysis/CostModel/X86/strided-load-i8.ll index 2a3a83864151..a97a32c5c940 100755 --- 
a/test/Analysis/CostModel/X86/strided-load-i8.ll +++ b/test/Analysis/CostModel/X86/strided-load-i8.ll @@ -1,117 +1,117 @@ -; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@A = global [10240 x i8] zeroinitializer, align 16
-@B = global [10240 x i8] zeroinitializer, align 16
-
-; Function Attrs: nounwind uwtable
-define void @load_i8_stride2() {
-;CHECK-LABEL: load_i8_stride2
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 8 for VF 32 For instruction: %1 = load
-;CHECK: Found an estimated cost of 20 for VF 64 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 1
- %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
- %1 = load i8, i8* %arrayidx, align 2
- %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
- store i8 %1, i8* %arrayidx2, align 1
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i8_stride3() {
-;CHECK-LABEL: load_i8_stride3
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load
-;CHECK: Found an estimated cost of 39 for VF 64 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
- %1 = load i8, i8* %arrayidx, align 2
- %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
- store i8 %1, i8* %arrayidx2, align 1
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i8_stride4() {
-;CHECK-LABEL: load_i8_stride4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load
-;CHECK: Found an estimated cost of 59 for VF 64 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 2
- %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
- %1 = load i8, i8* %arrayidx, align 2
- %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
- store i8 %1, i8* %arrayidx2, align 1
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i8_stride5() {
-;CHECK-LABEL: load_i8_stride5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 20 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 39 for VF 32 For instruction: %1 = load
-;CHECK: Found an estimated cost of 78 for VF 64 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 5
- %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
- %1 = load i8, i8* %arrayidx, align 2
- %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
- store i8 %1, i8* %arrayidx2, align 1
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
+; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i8] zeroinitializer, align 16 +@B = global [10240 x i8] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i8_stride2() { +;CHECK-LABEL: load_i8_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i8_stride3() { +;CHECK-LABEL: load_i8_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 39 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i8_stride4() { +;CHECK-LABEL: load_i8_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 59 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 
%indvars.iv, 2 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i8_stride5() { +;CHECK-LABEL: load_i8_stride5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 39 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 78 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll index 6756f3ba2802..13519018d957 100644 --- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll @@ -7,6 +7,9 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BWVL ; Verify the cost of vector arithmetic shift right instructions. 
@@ -121,6 +124,8 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; AVX512VL: Found an estimated cost of 20 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %shift = ashr <32 x i16> %a, %b ret <32 x i16> %shift @@ -146,6 +151,8 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX2: Found an estimated cost of 24 for instruction: %shift ; AVX512F: Found an estimated cost of 24 for instruction: %shift ; AVX512BW: Found an estimated cost of 24 for instruction: %shift +; AVX512VL: Found an estimated cost of 24 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift @@ -158,7 +165,9 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX: Found an estimated cost of 96 for instruction: %shift ; AVX2: Found an estimated cost of 48 for instruction: %shift ; AVX512F: Found an estimated cost of 48 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 24 for instruction: %shift +; AVX512VL: Found an estimated cost of 48 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift @@ -283,6 +292,8 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; AVX512VL: Found an estimated cost of 20 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer %shift = ashr <32 x i16> %a, %splat @@ -322,7 +333,9 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX: Found an estimated cost of 96 for instruction: %shift ; AVX2: Found an estimated cost of 48 for instruction: %shift ; AVX512F: Found an estimated cost of 48 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 24 for instruction: %shift +; AVX512VL: Found an estimated cost of 48 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer %shift = ashr <64 x i8> %a, %splat @@ -440,6 +453,8 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; AVX512VL: Found an estimated cost of 20 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %shift = ashr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 
4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> ret <32 x i16> %shift @@ -476,7 +491,9 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX: Found an estimated cost of 96 for instruction: %shift ; AVX2: Found an estimated cost of 48 for instruction: %shift ; AVX512F: Found an estimated cost of 48 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 24 for instruction: %shift +; AVX512VL: Found an estimated cost of 48 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <64 x i8> %shift @@ -593,6 +610,8 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; AVX512VL: Found an estimated cost of 2 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> @@ -632,6 +651,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX2: Found an estimated cost of 8 for instruction: %shift ; AVX512F: Found an estimated cost of 8 for instruction: %shift ; AVX512BW: Found an estimated cost of 4 for instruction: %shift +; AVX512VL: Found an estimated cost of 8 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 4 for instruction: %shift ; XOPAVX: Found an estimated cost of 16 for instruction: %shift ; XOPAVX2: Found an estimated cost of 8 for instruction: %shift %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll index 63e6db194d52..1e0fbce710ef 100644 --- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll @@ -7,6 +7,9 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK 
--check-prefix=AVX512 --check-prefix=AVX512F ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BWVL ; Verify the cost of vector logical shift right instructions. @@ -124,6 +127,8 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; AVX512VL: Found an estimated cost of 20 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %shift = lshr <32 x i16> %a, %b ret <32 x i16> %shift @@ -160,7 +165,9 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 11 for instruction: %shift +; AVX512VL: Found an estimated cost of 22 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -288,6 +295,8 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; AVX512VL: Found an estimated cost of 20 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer %shift = lshr <32 x i16> %a, %splat @@ -327,7 +336,9 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 11 for instruction: %shift +; AVX512VL: Found an estimated cost of 22 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer %shift = lshr <64 x i8> %a, %splat @@ -448,6 +459,8 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: 
%shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; AVX512VL: Found an estimated cost of 20 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> ret <32 x i16> %shift @@ -484,7 +497,9 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 11 for instruction: %shift +; AVX512VL: Found an estimated cost of 22 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <64 x i8> %shift @@ -603,6 +618,8 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; AVX512VL: Found an estimated cost of 2 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> @@ -642,6 +659,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512F: Found an estimated cost of 4 for instruction: %shift ; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512VL: Found an estimated cost of 4 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 2 for instruction: %shift ; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 4 for instruction: %shift %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll index 8c42bd66c707..031d530dcd56 100644 --- a/test/Analysis/CostModel/X86/vshift-shl-cost.ll +++ 
b/test/Analysis/CostModel/X86/vshift-shl-cost.ll @@ -7,6 +7,9 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BWVL ; Verify the cost of vector shift left instructions. @@ -161,7 +164,9 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX: Found an estimated cost of 44 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 11 for instruction: %shift +; AVX512VL: Found an estimated cost of 22 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -289,6 +294,8 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; AVX512VL: Found an estimated cost of 20 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer %shift = shl <32 x i16> %a, %splat @@ -328,7 +335,9 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX: Found an estimated cost of 44 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 11 for instruction: %shift +; AVX512VL: Found an estimated cost of 22 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer %shift = shl <64 x i8> %a, %splat @@ -450,6 +459,8 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; AVX512VL: Found an estimated cost of 2 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 1 
for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> @@ -487,7 +498,9 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX: Found an estimated cost of 44 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 11 for instruction: %shift +; AVX512VL: Found an estimated cost of 22 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <64 x i8> %shift @@ -608,6 +621,8 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; AVX512VL: Found an estimated cost of 2 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> @@ -647,6 +662,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512F: Found an estimated cost of 4 for instruction: %shift ; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512VL: Found an estimated cost of 4 for instruction: %shift +; AVX512BWVL: Found an estimated cost of 2 for instruction: %shift ; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 4 for instruction: %shift %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> diff --git a/test/Analysis/ScalarEvolution/max-trip-count.ll b/test/Analysis/ScalarEvolution/max-trip-count.ll index 614e9b265ab0..d87e7d033a1e 100644 --- a/test/Analysis/ScalarEvolution/max-trip-count.ll +++ b/test/Analysis/ScalarEvolution/max-trip-count.ll @@ -207,3 +207,84 @@ for.cond.i: ; preds = %for.body.i bar.exit: ; preds = %for.cond.i, %for.body.i ret i32 0 } + +; CHECK-LABEL: 
@ne_max_trip_count_1 +; CHECK: Loop %for.body: max backedge-taken count is 7 +define i32 @ne_max_trip_count_1(i32 %n) { +entry: + %masked = and i32 %n, 7 + br label %for.body + +for.body: + %i = phi i32 [ 0, %entry ], [ %add, %for.body ] + %add = add nsw i32 %i, 1 + %cmp = icmp ne i32 %i, %masked + br i1 %cmp, label %for.body, label %bar.exit + +bar.exit: + ret i32 0 +} + +; CHECK-LABEL: @ne_max_trip_count_2 +; CHECK: Loop %for.body: max backedge-taken count is -1 +define i32 @ne_max_trip_count_2(i32 %n) { +entry: + %masked = and i32 %n, 7 + br label %for.body + +for.body: + %i = phi i32 [ 0, %entry ], [ %add, %for.body ] + %add = add nsw i32 %i, 1 + %cmp = icmp ne i32 %add, %masked + br i1 %cmp, label %for.body, label %bar.exit + +bar.exit: + ret i32 0 +} + +; CHECK-LABEL: @ne_max_trip_count_3 +; CHECK: Loop %for.body: max backedge-taken count is 6 +define i32 @ne_max_trip_count_3(i32 %n) { +entry: + %masked = and i32 %n, 7 + %guard = icmp eq i32 %masked, 0 + br i1 %guard, label %exit, label %for.preheader + +for.preheader: + br label %for.body + +for.body: + %i = phi i32 [ 0, %for.preheader ], [ %add, %for.body ] + %add = add nsw i32 %i, 1 + %cmp = icmp ne i32 %add, %masked + br i1 %cmp, label %for.body, label %loop.exit + +loop.exit: + br label %exit + +exit: + ret i32 0 +} + +; CHECK-LABEL: @ne_max_trip_count_4 +; CHECK: Loop %for.body: max backedge-taken count is -2 +define i32 @ne_max_trip_count_4(i32 %n) { +entry: + %guard = icmp eq i32 %n, 0 + br i1 %guard, label %exit, label %for.preheader + +for.preheader: + br label %for.body + +for.body: + %i = phi i32 [ 0, %for.preheader ], [ %add, %for.body ] + %add = add nsw i32 %i, 1 + %cmp = icmp ne i32 %add, %n + br i1 %cmp, label %for.body, label %loop.exit + +loop.exit: + br label %exit + +exit: + ret i32 0 +} diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll index e91a1a42c233..8d9a8c06aa3c 100644 --- a/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -904,8 +904,9 @@ define <8 x i8> @getl(<16 x i8> %x) #0 { ; CHECK-LABEL: test_extracts_inserts_varidx_extract: ; CHECK: str q0 -; CHECK: add x[[PTR:[0-9]+]], {{.*}}, w0, sxtw #1 -; CHECK-DAG: ld1 { v[[R:[0-9]+]].h }[0], [x[[PTR]]] +; CHECK-DAG: and [[MASKED_IDX:x[0-9]+]], x0, #0x7 +; CHECK: bfi [[PTR:x[0-9]+]], [[MASKED_IDX]], #1, #3 +; CHECK-DAG: ld1 { v[[R:[0-9]+]].h }[0], {{\[}}[[PTR]]{{\]}} ; CHECK-DAG: ins v[[R]].h[1], v0.h[1] ; CHECK-DAG: ins v[[R]].h[2], v0.h[2] ; CHECK-DAG: ins v[[R]].h[3], v0.h[3] @@ -922,7 +923,9 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) { } ; CHECK-LABEL: test_extracts_inserts_varidx_insert: -; CHECK: str h0, [{{.*}}, w0, sxtw #1] +; CHECK: and [[MASKED_IDX:x[0-9]+]], x0, #0x3 +; CHECK: bfi x9, [[MASKED_IDX]], #1, #2 +; CHECK: st1 { v0.h }[0], [x9] ; CHECK-DAG: ldr d[[R:[0-9]+]] ; CHECK-DAG: ins v[[R]].h[1], v0.h[1] ; CHECK-DAG: ins v[[R]].h[2], v0.h[2] diff --git a/test/CodeGen/AArch64/arm64-nvcast.ll b/test/CodeGen/AArch64/arm64-nvcast.ll index c3a1640ab012..ba2512718c4e 100644 --- a/test/CodeGen/AArch64/arm64-nvcast.ll +++ b/test/CodeGen/AArch64/arm64-nvcast.ll @@ -1,10 +1,12 @@ ; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s ; CHECK-LABEL: _test: -; CHECK: fmov.2d v0, #2.00000000 -; CHECK: str q0, [sp, #-16]! 
-; CHECK: mov x8, sp -; CHECK: ldr s0, [x8, w1, sxtw #2] +; CHECK-DAG: fmov.2d v0, #2.00000000 +; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3 +; CHECK-DAG: mov x9, sp +; CHECK-DAG: str q0, [sp], #16 +; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2 +; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}} ; CHECK: str s0, [x0] define void @test(float * %p1, i32 %v1) { @@ -16,9 +18,11 @@ entry: ; CHECK-LABEL: _test2 ; CHECK: movi.16b v0, #63 -; CHECK: str q0, [sp, #-16]! -; CHECK: mov x8, sp -; CHECK: ldr s0, [x8, w1, sxtw #2] +; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3 +; CHECK-DAG: str q0, [sp], #16 +; CHECK-DAG: mov x9, sp +; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2 +; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}} ; CHECK: str s0, [x0] define void @test2(float * %p1, i32 %v1) { diff --git a/test/CodeGen/AArch64/bitreverse.ll b/test/CodeGen/AArch64/bitreverse.ll index 135bce3bdb6c..85496ab03214 100644 --- a/test/CodeGen/AArch64/bitreverse.ll +++ b/test/CodeGen/AArch64/bitreverse.ll @@ -1,14 +1,18 @@ ; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s -; These tests just check that the plumbing is in place for @llvm.bitreverse. The -; actual output is massive at the moment as llvm.bitreverse is not yet legal. +; These tests just check that the plumbing is in place for @llvm.bitreverse. declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone define <2 x i16> @f(<2 x i16> %a) { ; CHECK-LABEL: f: -; CHECK: rev32 -; CHECK: ushr +; CHECK: fmov [[REG1:w[0-9]+]], s0 +; CHECK-DAG: rbit [[REG2:w[0-9]+]], [[REG1]] +; CHECK-DAG: fmov s0, [[REG2]] +; CHECK-DAG: mov [[REG3:w[0-9]+]], v0.s[1] +; CHECK-DAG: rbit [[REG4:w[0-9]+]], [[REG3]] +; CHECK-DAG: ins v0.s[1], [[REG4]] +; CHECK-DAG: ushr v0.2s, v0.2s, #16 %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a) ret <2 x i16> %b } @@ -17,26 +21,9 @@ declare i8 @llvm.bitreverse.i8(i8) readnone define i8 @g(i8 %a) { ; CHECK-LABEL: g: -; CHECK-DAG: rev [[RV:w.*]], w0 -; CHECK-DAG: and [[L4:w.*]], [[RV]], #0xf0f0f0f -; CHECK-DAG: and [[H4:w.*]], [[RV]], #0xf0f0f0f0 -; CHECK-DAG: lsr [[S4:w.*]], [[H4]], #4 -; CHECK-DAG: orr [[R4:w.*]], [[S4]], [[L4]], lsl #4 - -; CHECK-DAG: and [[L2:w.*]], [[R4]], #0x33333333 -; CHECK-DAG: and [[H2:w.*]], [[R4]], #0xcccccccc -; CHECK-DAG: lsr [[S2:w.*]], [[H2]], #2 -; CHECK-DAG: orr [[R2:w.*]], [[S2]], [[L2]], lsl #2 - -; CHECK-DAG: mov [[P1:w.*]], #1426063360 -; CHECK-DAG: mov [[N1:w.*]], #-1442840576 -; CHECK-DAG: and [[L1:w.*]], [[R2]], [[P1]] -; CHECK-DAG: and [[H1:w.*]], [[R2]], [[N1]] -; CHECK-DAG: lsr [[S1:w.*]], [[H1]], #1 -; CHECK-DAG: orr [[R1:w.*]], [[S1]], [[L1]], lsl #1 - -; CHECK-DAG: lsr w0, [[R1]], #24 -; CHECK-DAG: ret +; CHECK: rbit [[REG:w[0-9]+]], w0 +; CHECK-NEXT: lsr w0, [[REG]], #24 +; CHECK-NEXT: ret %b = call i8 @llvm.bitreverse.i8(i8 %a) ret i8 %b } diff --git a/test/CodeGen/AArch64/rbit.ll b/test/CodeGen/AArch64/rbit.ll index 3404ae4b6bee..288a25bd65e3 100644 --- a/test/CodeGen/AArch64/rbit.ll +++ b/test/CodeGen/AArch64/rbit.ll @@ -1,5 +1,8 @@ ; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s +; The llvm.aarch64.rbit intrinsic should be auto-upgraded to the +; target-independent bitreverse intrinsic. 
+ ; CHECK-LABEL: rbit32 ; CHECK: rbit w0, w0 define i32 @rbit32(i32 %t) { @@ -18,3 +21,22 @@ entry: declare i64 @llvm.aarch64.rbit.i64(i64) declare i32 @llvm.aarch64.rbit.i32(i32) + +; CHECK-LABEL: rbit_generic32 +; CHECK: rbit w0, w0 +define i32 @rbit_generic32(i32 %t) { +entry: + %rbit = call i32 @llvm.bitreverse.i32(i32 %t) + ret i32 %rbit +} + +; CHECK-LABEL: rbit_generic64 +; CHECK: rbit x0, x0 +define i64 @rbit_generic64(i64 %t) { +entry: + %rbit = call i64 @llvm.bitreverse.i64(i64 %t) + ret i64 %rbit +} + +declare i32 @llvm.bitreverse.i32(i32) readnone +declare i64 @llvm.bitreverse.i64(i64) readnone diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir new file mode 100644 index 000000000000..34bb2588ad62 --- /dev/null +++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -0,0 +1,858 @@ +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s +--- | + define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %and = and i32 %a, 1234567 + store volatile i32 %and, i32 addrspace(1)* %out + ret void + } + + define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = sext i32 %tid to i64 + %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom + %a = load i32, i32 addrspace(1)* %gep.a + %and = and i32 %a, 1234567 + store i32 %and, i32 addrspace(1)* %gep.out + ret void + } + + define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %shl = shl i32 %a, 12 + store volatile i32 %shl, i32 addrspace(1)* %out + ret void + } + + define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = sext i32 %tid to i64 + %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom + %a = load i32, i32 addrspace(1)* %gep.a + %shl = shl i32 %a, 12 + store i32 %shl, i32 addrspace(1)* %gep.out + ret void + } + + define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %ashr = ashr i32 %a, 12 + store volatile i32 %ashr, i32 addrspace(1)* %out + ret void + } + + define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = sext i32 %tid to i64 + %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom + %a = load i32, i32 addrspace(1)* %gep.a + %ashr = ashr i32 %a, 12 + store i32 %ashr, i32 addrspace(1)* %gep.out + ret void + } + + define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %lshr = lshr i32 %a, 12 + store volatile i32 %lshr, i32 addrspace(1)* %out + ret void + } + + define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = sext i32 %tid to i64 + %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom + %a = load i32, i32 addrspace(1)* %gep.a + %lshr = lshr i32 %a, 12 + store i32 %lshr, i32 addrspace(1)* %gep.out + ret void + } + + declare i32 
@llvm.amdgcn.workitem.id.x() #1 + + attributes #0 = { nounwind } + attributes #1 = { nounwind readnone } + +... +--- + +# GCN-LABEL: name: s_fold_and_imm_regimm_32{{$}} +# GCN: %10 = V_MOV_B32_e32 1543, implicit %exec +# GCN: BUFFER_STORE_DWORD_OFFSET killed %10, +name: s_fold_and_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_64_xexec } + - { id: 2, class: sreg_32_xm0 } + - { id: 3, class: sreg_32_xm0 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_128 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1 + + %0 = COPY %sgpr0_sgpr1 + %1 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %2 = COPY %1.sub1 + %3 = COPY %1.sub0 + %4 = S_MOV_B32 61440 + %5 = S_MOV_B32 -1 + %6 = REG_SEQUENCE killed %2, 1, killed %3, 2, killed %4, 3, killed %5, 4 + %7 = S_MOV_B32 1234567 + %8 = S_MOV_B32 9999 + %9 = S_AND_B32 killed %7, killed %8, implicit-def dead %scc + %10 = COPY %9 + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + S_ENDPGM + +... 
+--- + +# GCN-LABEL: name: v_fold_and_imm_regimm_32{{$}} + +# GCN: %9 = V_MOV_B32_e32 646, implicit %exec +# GCN: FLAT_STORE_DWORD %19, %9, + +# GCN: %10 = V_MOV_B32_e32 646, implicit %exec +# GCN: FLAT_STORE_DWORD %19, %10 + +# GCN: %11 = V_MOV_B32_e32 646, implicit %exec +# GCN: FLAT_STORE_DWORD %19, %11, + +# GCN: %12 = V_MOV_B32_e32 1234567, implicit %exec +# GCN: FLAT_STORE_DWORD %19, %12, + +# GCN: %13 = V_MOV_B32_e32 63, implicit %exec +# GCN: FLAT_STORE_DWORD %19, %13, + +name: v_fold_and_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 20, class: sreg_32_xm0 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: sreg_32_xm0 } + - { id: 27, class: vgpr_32 } + - { id: 28, class: vgpr_32 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vgpr_32 } + - { id: 31, class: vgpr_32 } + - { id: 32, class: vreg_64 } + - { id: 33, class: vreg_64 } + - { id: 34, class: vgpr_32 } + - { id: 35, class: vgpr_32 } + - { id: 36, class: vgpr_32 } + - { id: 37, class: vreg_64 } + - { id: 44, class: vgpr_32 } + +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %31 = V_ASHRREV_I32_e64 31, %3, implicit %exec + %32 = REG_SEQUENCE %3, 1, %31, 2 + %33 = V_LSHLREV_B64 2, killed %32, implicit %exec + %20 = COPY %4.sub1 + %44 = V_ADD_I32_e32 %4.sub0, %33.sub0, implicit-def %vcc, implicit %exec + %36 = COPY killed %20 + %35 = V_ADDC_U32_e32 %33.sub1, %36, implicit-def %vcc, implicit %vcc, implicit %exec + %37 = REG_SEQUENCE %44, 1, killed %35, 2 + %24 = V_MOV_B32_e32 982, implicit %exec + %26 = S_MOV_B32 1234567 + %34 = V_MOV_B32_e32 63, implicit %exec + + %27 = V_AND_B32_e64 %26, %24, implicit %exec + FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %28 = V_AND_B32_e64 %24, %26, implicit %exec + FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %29 = V_AND_B32_e32 %26, %24, implicit %exec + FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %30 = V_AND_B32_e64 %26, %26, implicit %exec + FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %31 = V_AND_B32_e64 %34, %34, implicit %exec + FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + S_ENDPGM + +... 
+--- + +# GCN-LABEL: name: s_fold_shl_imm_regimm_32{{$}} +# GCN: %13 = V_MOV_B32_e32 4096, implicit %exec +# GCN: BUFFER_STORE_DWORD_OFFSET killed %13, + +name: s_fold_shl_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0_xexec } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: sreg_128 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1 + + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_MOV_B32 1 + %6 = COPY %4.sub1 + %7 = COPY %4.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 + %12 = S_LSHL_B32 killed %5, 12, implicit-def dead %scc + %13 = COPY %12 + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + S_ENDPGM + +... +--- +# GCN-LABEL: name: v_fold_shl_imm_regimm_32{{$}} + +# GCN: %11 = V_MOV_B32_e32 40955904, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %11, + +# GCN: %12 = V_MOV_B32_e32 24, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %12, + +# GCN: %13 = V_MOV_B32_e32 4096, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %13, + +# GCN: %14 = V_MOV_B32_e32 24, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %14, + +# GCN: %15 = V_MOV_B32_e32 0, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %15, + +# GCN: %22 = V_MOV_B32_e32 4096, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %22, + +# GCN: %23 = V_MOV_B32_e32 1, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %23, + +# GCN: %25 = V_MOV_B32_e32 2, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %25, + +# GCN: %26 = V_MOV_B32_e32 7927808, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %26, + +# GCN: %28 = V_MOV_B32_e32 -8, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %28, + +name: v_fold_shl_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64_xexec } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_64 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vreg_64 } + - { id: 17, class: vreg_64 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: vgpr_32 } + - { id: 22, class:
vgpr_32 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: sreg_32_xm0 } + - { id: 28, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %2 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec + %16 = REG_SEQUENCE %2, 1, %15, 2 + %17 = V_LSHLREV_B64 2, killed %16, implicit %exec + %9 = COPY %3.sub1 + %21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec + %19 = COPY killed %9 + %18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec + %20 = REG_SEQUENCE %21, 1, killed %18, 2 + %10 = V_MOV_B32_e32 9999, implicit %exec + %24 = V_MOV_B32_e32 3871, implicit %exec + %6 = V_MOV_B32_e32 1, implicit %exec + %7 = S_MOV_B32 1 + %27 = S_MOV_B32 -4 + + %11 = V_LSHLREV_B32_e64 12, %10, implicit %exec + FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec + FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %13 = V_LSHL_B32_e64 %7, 12, implicit %exec + FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %14 = V_LSHL_B32_e64 12, %7, implicit %exec + FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %15 = V_LSHL_B32_e64 12, %24, implicit %exec + FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %22 = V_LSHL_B32_e64 %6, 12, implicit %exec + FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %23 = V_LSHL_B32_e64 %6, 32, implicit %exec + FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %25 = V_LSHL_B32_e32 %6, %6, implicit %exec + FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %26 = V_LSHLREV_B32_e32 11, %24, implicit %exec + FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %28 = V_LSHL_B32_e32 %27, %6, implicit %exec + FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + S_ENDPGM + +... 
+--- + +# GCN-LABEL: name: s_fold_ashr_imm_regimm_32{{$}} +# GCN: %11 = V_MOV_B32_e32 243, implicit %exec +# GCN: BUFFER_STORE_DWORD_OFFSET killed %11, killed %8, +name: s_fold_ashr_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0_xexec } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: sreg_128 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1 + + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_MOV_B32 999123 + %6 = COPY %4.sub1 + %7 = COPY %4.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 + %12 = S_ASHR_I32 killed %5, 12, implicit-def dead %scc + %13 = COPY %12 + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + S_ENDPGM + +... + +# GCN-LABEL: name: v_fold_ashr_imm_regimm_32{{$}} +# GCN: %11 = V_MOV_B32_e32 3903258, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %11, + +# GCN: %12 = V_MOV_B32_e32 62452139, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %12, + +# GCN: %13 = V_MOV_B32_e32 1678031, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %13, + +# GCN: %14 = V_MOV_B32_e32 3, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %14, + +# GCN: %15 = V_MOV_B32_e32 -1, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %15, + +# GCN: %22 = V_MOV_B32_e32 62500, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %22, + +# GCN: %23 = V_MOV_B32_e32 500000, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %23, + +# GCN: %25 = V_MOV_B32_e32 1920, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %25, + +# GCN: %26 = V_MOV_B32_e32 487907, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %26, + +# GCN: %28 = V_MOV_B32_e32 -1, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %28, + +name: v_fold_ashr_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64_xexec } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vreg_64 } + - { id: 17, class: vreg_64 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, 
class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: sreg_32_xm0 } + - { id: 28, class: vgpr_32 } + - { id: 32, class: sreg_32_xm0 } + - { id: 33, class: sreg_32_xm0 } + - { id: 34, class: vgpr_32 } + - { id: 35, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %2 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec + %16 = REG_SEQUENCE %2, 1, %15, 2 + %17 = V_LSHLREV_B64 2, killed %16, implicit %exec + %9 = COPY %3.sub1 + %21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec + %19 = COPY killed %9 + %18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec + %20 = REG_SEQUENCE %21, 1, killed %18, 2 + %10 = V_MOV_B32_e32 999234234, implicit %exec + %24 = V_MOV_B32_e32 3871, implicit %exec + %6 = V_MOV_B32_e32 1000000, implicit %exec + %7 = S_MOV_B32 13424252 + %8 = S_MOV_B32 4 + %27 = S_MOV_B32 -4 + %32 = S_MOV_B32 1 + %33 = S_MOV_B32 3841 + %34 = V_MOV_B32_e32 3841, implicit %exec + %35 = V_MOV_B32_e32 2, implicit %exec + + %11 = V_ASHRREV_I32_e64 8, %10, implicit %exec + FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec + FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %13 = V_ASHR_I32_e64 %7, 3, implicit %exec + FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %14 = V_ASHR_I32_e64 7, %32, implicit %exec + FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %15 = V_ASHR_I32_e64 %27, %24, implicit %exec + FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %22 = V_ASHR_I32_e64 %6, 4, implicit %exec + FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %23 = V_ASHR_I32_e64 %6, %33, implicit %exec + FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %25 = V_ASHR_I32_e32 %34, %34, implicit %exec + FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %26 = V_ASHRREV_I32_e32 11, %10, implicit %exec + FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %28 = V_ASHR_I32_e32 %27, %35, implicit %exec + FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + S_ENDPGM + +... 
+--- + +# GCN-LABEL: name: s_fold_lshr_imm_regimm_32{{$}} +# GCN: %11 = V_MOV_B32_e32 1048332, implicit %exec +# GCN: BUFFER_STORE_DWORD_OFFSET killed %11, killed %8, +name: s_fold_lshr_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0_xexec } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: sreg_128 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1 + + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_MOV_B32 -999123 + %6 = COPY %4.sub1 + %7 = COPY %4.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 + %12 = S_LSHR_B32 killed %5, 12, implicit-def dead %scc + %13 = COPY %12 + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + S_ENDPGM + +... +--- + +# GCN-LABEL: name: v_fold_lshr_imm_regimm_32{{$}} +# GCN: %11 = V_MOV_B32_e32 3903258, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %11, + +# GCN: %12 = V_MOV_B32_e32 62452139, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %12, + +# GCN: %13 = V_MOV_B32_e32 1678031, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %13, + +# GCN: %14 = V_MOV_B32_e32 3, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %14, + +# GCN: %15 = V_MOV_B32_e32 1, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %15, + +# GCN: %22 = V_MOV_B32_e32 62500, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %22, + +# GCN: %23 = V_MOV_B32_e32 500000, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %23, + +# GCN: %25 = V_MOV_B32_e32 1920, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %25, + +# GCN: %26 = V_MOV_B32_e32 487907, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %26, + +# GCN: %28 = V_MOV_B32_e32 1073741823, implicit %exec +# GCN: FLAT_STORE_DWORD %20, %28, + +name: v_fold_lshr_imm_regimm_32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64_xexec } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vreg_64 } + - { id: 17, class: vreg_64 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + 
- { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: sreg_32_xm0 } + - { id: 28, class: vgpr_32 } + - { id: 32, class: sreg_32_xm0 } + - { id: 33, class: sreg_32_xm0 } + - { id: 34, class: vgpr_32 } + - { id: 35, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %2 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec + %16 = REG_SEQUENCE %2, 1, %15, 2 + %17 = V_LSHLREV_B64 2, killed %16, implicit %exec + %9 = COPY %3.sub1 + %21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec + %19 = COPY killed %9 + %18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec + %20 = REG_SEQUENCE %21, 1, killed %18, 2 + %10 = V_MOV_B32_e32 999234234, implicit %exec + %24 = V_MOV_B32_e32 3871, implicit %exec + %6 = V_MOV_B32_e32 1000000, implicit %exec + %7 = S_MOV_B32 13424252 + %8 = S_MOV_B32 4 + %27 = S_MOV_B32 -4 + %32 = S_MOV_B32 1 + %33 = S_MOV_B32 3841 + %34 = V_MOV_B32_e32 3841, implicit %exec + %35 = V_MOV_B32_e32 2, implicit %exec + + %11 = V_LSHRREV_B32_e64 8, %10, implicit %exec + FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec + FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %13 = V_LSHR_B32_e64 %7, 3, implicit %exec + FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %14 = V_LSHR_B32_e64 7, %32, implicit %exec + FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %15 = V_LSHR_B32_e64 %27, %24, implicit %exec + FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %22 = V_LSHR_B32_e64 %6, 4, implicit %exec + FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %23 = V_LSHR_B32_e64 %6, %33, implicit %exec + FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %25 = V_LSHR_B32_e32 %34, %34, implicit %exec + FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %26 = V_LSHRREV_B32_e32 11, %10, implicit %exec + FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + %28 = V_LSHR_B32_e32 %27, %35, implicit %exec + FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + + S_ENDPGM + +... 
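[Editorial cross-check, not part of the commit: the MIR tests above hard-code the immediates that si-fold-operands is expected to materialize in the V_MOV_B32 check lines. They follow ordinary 32-bit two's-complement arithmetic, with vector shift amounts masked to the low 5 bits as on GCN hardware (visible above where a shift by 32 folds to a shift by 0). A minimal C sketch verifying a few of the expected values:]

```c
/* Hand-check of a few immediates expected by the GCN check lines above.
   Assumes 32-bit two's-complement arithmetic and GCN's masking of shift
   amounts to 5 bits. Editorial sketch only, not part of the commit. */
#include <assert.h>
#include <stdint.h>

int main(void) {
  assert((1234567u & 9999u) == 1543u);          /* s_fold_and_imm_regimm_32  */
  assert((1234567u & 982u) == 646u);            /* v_fold_and_imm_regimm_32  */
  assert((9999u << 12) == 40955904u);           /* v_fold_shl_imm_regimm_32  */
  assert((1u << (32u & 31u)) == 1u);            /* shl by 32 folds to shl 0  */
  assert(((int32_t)999123 >> 12) == 243);       /* s_fold_ashr_imm_regimm_32 */
  assert((uint32_t)-999123 >> 12 == 1048332u);  /* s_fold_lshr_imm_regimm_32 */
  assert((uint32_t)-4 >> 2 == 1073741823u);     /* v_fold_lshr: -4 lshr 2    */
  return 0;
}
```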
diff --git a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll new file mode 100644 index 000000000000..b74bce76f79c --- /dev/null +++ b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -0,0 +1,262 @@ +; RUN: llc -march=amdgcn -mattr=+fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s +; RUN: llc -march=amdgcn -mattr=-fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s + +; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s +; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s + +; FIXME: This should also fold when FMA is actually fast, if an FMA +; exists in the original program. + +; (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z)) + +; GCN-LABEL: {{^}}fast_add_fmuladd_fmul: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[U:v[0-9]+]] +; GCN: buffer_load_dword [[V:v[0-9]+]] + +; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[V]], [[U]] +; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[Y]], [[X]] +; GCN-FLUSH-NEXT: buffer_store_dword [[Z]] + +; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]] +; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]] +; GCN-FASTFMA: buffer_store_dword [[FMA1]] + +; GCN-SLOWFMA: v_mul_f32_e32 +; GCN-SLOWFMA: v_mul_f32_e32 +; GCN-SLOWFMA: v_add_f32_e32 +; GCN-SLOWFMA: v_add_f32_e32 +define void @fast_add_fmuladd_fmul() #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %u = load volatile float, float addrspace(1)* undef + %v = load volatile float, float addrspace(1)* undef + %mul.u.v = fmul fast float %u, %v + %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) + %add = fadd fast float %fma, %z + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[U:v[0-9]+]] +; GCN: buffer_load_dword [[V:v[0-9]+]] + +; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]+]], [[U]], [[V]], -[[Z]] +; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[Y]], [[X]] +; GCN-FLUSH-NEXT: buffer_store_dword [[TMP]] + +; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]] +; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]] +; GCN-FASTFMA: buffer_store_dword [[FMA1]] +define void @fast_sub_fmuladd_fmul() #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %u = load volatile float, float addrspace(1)* undef + %v = load volatile float, float addrspace(1)* undef + %mul.u.v = fmul fast float %u, %v + %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) + %add = fsub fast float %fma, %z + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword
[[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[U:v[0-9]+]] +; GCN: buffer_load_dword [[V:v[0-9]+]] + +; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] +; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]] +; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]] + +; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] +; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]] +; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]] + +; GCN-SLOWFMA: v_mul_f32_e32 +; GCN-SLOWFMA: v_mul_f32_e32 +; GCN-SLOWFMA: v_add_f32_e32 +; GCN-SLOWFMA: v_add_f32_e32 +define void @fast_add_fmuladd_fmul_multi_use_mul() #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %u = load volatile float, float addrspace(1)* undef + %v = load volatile float, float addrspace(1)* undef + %mul.u.v = fmul fast float %u, %v + store volatile float %mul.u.v, float addrspace(1)* undef + %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) + %add = fadd fast float %fma, %z + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul_commute: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[U:v[0-9]+]] +; GCN: buffer_load_dword [[V:v[0-9]+]] + +; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] +; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]] +; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]] + +; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] +; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]] +; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]] + +; GCN-SLOWFMA: v_mul_f32_e32 +; GCN-SLOWFMA: v_mul_f32_e32 +; GCN-SLOWFMA: v_add_f32_e32 +; GCN-SLOWFMA: v_add_f32_e32 +define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %u = load volatile float, float addrspace(1)* undef + %v = load volatile float, float addrspace(1)* undef + %mul.u.v = fmul fast float %u, %v + store volatile float %mul.u.v, float addrspace(1)* undef + %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) + %add = fadd fast float %z, %fma + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[U:v[0-9]+]] +; GCN: buffer_load_dword [[V:v[0-9]+]] + +; GCN-SLOWFMA: v_mul_f32_e32 +; GCN-SLOWFMA: v_mul_f32_e32 +; GCN-SLOWFMA: v_add_f32_e32 +; GCN-SLOWFMA: v_add_f32_e32 +define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %u = load volatile float, float addrspace(1)* undef + %v = load volatile float, float addrspace(1)* undef + %mul.u.v = fmul fast float %u, %v + %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) + store volatile float %fma, float addrspace(1)* undef + %add = fadd fast float %fma, %z + store volatile float %add, float addrspace(1)* undef + 
ret void +} + +; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd_commute: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[U:v[0-9]+]] +; GCN: buffer_load_dword [[V:v[0-9]+]] + +; GCN-SLOWFMA: v_mul_f32_e32 +; GCN-SLOWFMA: v_mul_f32_e32 +; GCN-SLOWFMA: v_add_f32_e32 +; GCN-SLOWFMA: v_add_f32_e32 +define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %u = load volatile float, float addrspace(1)* undef + %v = load volatile float, float addrspace(1)* undef + %mul.u.v = fmul fast float %u, %v + %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) + store volatile float %fma, float addrspace(1)* undef + %add = fadd fast float %z, %fma + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_mul: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[U:v[0-9]+]] +; GCN: buffer_load_dword [[V:v[0-9]+]] + +; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] + +; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[Y]], [[X]], [[MUL]] +; GCN-FLUSH: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]] + +; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]] +; GCN-FASTFMA: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]] + +; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]] +; GCN-SLOWFMA: v_add_f32_e32 +; GCN-SLOWFMA: v_subrev_f32_e32 [[MAD:v[0-9]+]] + +; GCN: buffer_store_dword [[MUL]] +; GCN: buffer_store_dword [[MAD]] +define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %u = load volatile float, float addrspace(1)* undef + %v = load volatile float, float addrspace(1)* undef + %mul.u.v = fmul fast float %u, %v + %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) + %add = fsub fast float %fma, %z + store volatile float %mul.u.v, float addrspace(1)* undef + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[U:v[0-9]+]] +; GCN: buffer_load_dword [[V:v[0-9]+]] + +; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] + +; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[Y]], [[X]] +; GCN-FLUSH-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]] +; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]] +; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]] + +; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]] +; GCN-FASTFMA-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]] +; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]] +; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]] + +; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]] +; GCN-SLOWFMA: v_add_f32_e32 +; GCN-SLOWFMA: v_subrev_f32_e32 +define void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %u = load volatile 
float, float addrspace(1)* undef + %v = load volatile float, float addrspace(1)* undef + %mul.u.v = fmul fast float %u, %v + %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) + %add = fsub fast float %fma, %z + store volatile float %fma, float addrspace(1)* undef + store volatile float %add, float addrspace(1)* undef + ret void +} + +declare float @llvm.fma.f32(float, float, float) #1 +declare float @llvm.fmuladd.f32(float, float, float) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 9df336c2c489..10acae092e9f 100644 --- a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -18,9 +18,9 @@ declare float @llvm.fabs.f32(float) #1 ; VI: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0 ; VI: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}| ; VI: v_cndmask_b32_e32 -; VI: v_add_f32_e32 -; VI: v_mul_f32_e32 -; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 +; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| +; VI: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; VI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0 define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { %a11 = fadd fast float %y, -1.0 %a12 = call float @llvm.fabs.f32(float %a11) @@ -113,9 +113,9 @@ define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 { ; VI: v_add_f16_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0 ; VI: v_cmp_gt_f16_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}| ; VI: v_cndmask_b32_e32 -; VI: v_add_f16_e32 -; VI: v_mul_f16_e32 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 +; VI: v_add_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| +; VI: v_mul_f16_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0 define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll new file mode 100644 index 000000000000..d555d8d871de --- /dev/null +++ b/test/CodeGen/AMDGPU/fneg-combines.ll @@ -0,0 +1,1282 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s + +; -------------------------------------------------------------------------------- +; fadd tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_add_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]] +; GCN-NEXT: buffer_store_dword [[RESULT]] +define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %add = fadd float %a, %b + %fneg = fsub float -0.000000e+00, 
%add + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] +; GCN-NEXT: buffer_store_dword [[NEG_ADD]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %add = fadd float %a, %b + %fneg = fsub float -0.000000e+00, %add + store volatile float %fneg, float addrspace(1)* %out + store volatile float %add, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] +; GCN-NEXT: buffer_store_dword [[NEG_ADD]] +; GCN-NEXT: buffer_store_dword [[MUL]] +define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %add = fadd float %a, %b + %fneg = fsub float -0.000000e+00, %add + %use1 = fmul float %add, 4.0 + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %add = fadd float %fneg.a, %b + %fneg = fsub float -0.000000e+00, %add + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], 
[[B]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.b = fsub float -0.000000e+00, %b + %add = fadd float %a, %fneg.b + %fneg = fsub float -0.000000e+00, %add + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %fneg.b = fsub float -0.000000e+00, %b + %add = fadd float %fneg.a, %fneg.b + %fneg = fsub float -0.000000e+00, %add + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] +; GCN-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NEXT: buffer_store_dword [[NEG_ADD]] +; GCN-NEXT: buffer_store_dword [[NEG_A]] +define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %add = fadd float %fneg.a, %b + %fneg = fsub float -0.000000e+00, %add + store volatile float %fneg, float addrspace(1)* %out + store volatile float %fneg.a, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} +; GCN-NEXT: buffer_store_dword [[NEG_ADD]] +; GCN-NEXT: buffer_store_dword [[MUL]] +define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext 
= sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %add = fadd float %fneg.a, %b + %fneg = fsub float -0.000000e+00, %add + %use1 = fmul float %fneg.a, %c + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + +; -------------------------------------------------------------------------------- +; fmul tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_mul_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]] +; GCN-NEXT: buffer_store_dword [[RESULT]] +define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.000000e+00, %mul + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]] +; GCN-NEXT: buffer_store_dword [[NEG_MUL]] +; GCN: buffer_store_dword [[ADD]] +define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.000000e+00, %mul + store volatile float %fneg, float addrspace(1)* %out + store volatile float %mul, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]] +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] +; GCN-NEXT: buffer_store_dword [[NEG_MUL]] +; GCN: buffer_store_dword [[MUL]] +define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 
%tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.000000e+00, %mul + %use1 = fmul float %mul, 4.0 + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %mul = fmul float %fneg.a, %b + %fneg = fsub float -0.000000e+00, %mul + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.b = fsub float -0.000000e+00, %b + %mul = fmul float %a, %fneg.b + %fneg = fsub float -0.000000e+00, %mul + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %fneg.b = fsub float -0.000000e+00, %b + %mul = fmul float %fneg.a, %fneg.b + %fneg = fsub float -0.000000e+00, %mul + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32: +; GCN: 
{{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] +; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]] +; GCN-NEXT: buffer_store_dword [[NEG_MUL]] +; GCN: buffer_store_dword [[NEG_A]] +define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %mul = fmul float %fneg.a, %b + %fneg = fsub float -0.000000e+00, %mul + store volatile float %fneg, float addrspace(1)* %out + store volatile float %fneg.a, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} +; GCN-NEXT: buffer_store_dword [[NEG_MUL]] +; GCN: buffer_store_dword [[MUL]] +define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %mul = fmul float %fneg.a, %b + %fneg = fsub float -0.000000e+00, %mul + %use1 = fmul float %fneg.a, %c + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + +; -------------------------------------------------------------------------------- +; fma tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_fma_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] +; GCN-NEXT: buffer_store_dword [[RESULT]] +define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %fma = call float @llvm.fma.f32(float %a, float %b, 
float %c) + %fneg = fsub float -0.000000e+00, %fma + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] +; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]] +; GCN-NEXT: buffer_store_dword [[NEG_FMA]] +; GCN-NEXT: buffer_store_dword [[FMA]] +define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %fma = call float @llvm.fma.f32(float %a, float %b, float %c) + %fneg = fsub float -0.000000e+00, %fma + store volatile float %fneg, float addrspace(1)* %out + store volatile float %fma, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] +; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]] +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]] +; GCN-NEXT: buffer_store_dword [[NEG_FMA]] +; GCN-NEXT: buffer_store_dword [[MUL]] +define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %fma = call float @llvm.fma.f32(float %a, float %b, float %c) + %fneg = fsub float -0.000000e+00, %fma + %use1 = fmul float %fma, 4.0 + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] +; GCN-NEXT: buffer_store_dword [[FMA]] +define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = 
getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %fneg.a = fsub float -0.000000e+00, %a + %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) + %fneg = fsub float -0.000000e+00, %fma + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] +; GCN-NEXT: buffer_store_dword [[FMA]] +define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %fneg.b = fsub float -0.000000e+00, %b + %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c) + %fneg = fsub float -0.000000e+00, %fma + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]] +; GCN-NEXT: buffer_store_dword [[FMA]] +define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %fneg.a = fsub float -0.000000e+00, %a + %fneg.b = fsub float -0.000000e+00, %b + %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c) + %fneg = fsub float -0.000000e+00, %fma + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] +; GCN-NEXT: buffer_store_dword [[FMA]] +define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* 
%b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %fneg.a = fsub float -0.000000e+00, %a + %fneg.c = fsub float -0.000000e+00, %c + %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c) + %fneg = fsub float -0.000000e+00, %fma + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]] +; GCN-NEXT: buffer_store_dword [[FMA]] +define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %fneg.c = fsub float -0.000000e+00, %c + %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c) + %fneg = fsub float -0.000000e+00, %fma + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] +; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] +; GCN-NEXT: buffer_store_dword [[FMA]] +; GCN-NEXT: buffer_store_dword [[NEG_A]] +define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %fneg.a = fsub float -0.000000e+00, %a + %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) + %fneg = fsub float -0.000000e+00, %fma + store volatile float %fneg, float addrspace(1)* %out + store volatile float %fneg.a, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}v_fneg_fma_multi_use_fneg_x_y_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} +; GCN-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]] +; GCN-NEXT: buffer_store_dword [[NEG_FMA]] +; GCN-NEXT: buffer_store_dword [[MUL]] +define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %fneg.a = fsub float -0.000000e+00, %a + %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) + %fneg = fsub float -0.000000e+00, %fma + %use1 = fmul float %fneg.a, %d + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + +; -------------------------------------------------------------------------------- +; fmad tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_fmad_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] +; GCN-NEXT: buffer_store_dword [[RESULT]] +define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c) + %fneg = fsub float -0.000000e+00, %fma + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[NEG_C:v[0-9]+]], 0x80000000, [[C]] +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]] +; GCN-NEXT: buffer_store_dword [[NEG_C]] +; GCN-NEXT: buffer_store_dword [[MUL]] +define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float 
addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c) + %fneg = fsub float -0.000000e+00, %fma + %use1 = fmul float %fma, 4.0 + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + +; -------------------------------------------------------------------------------- +; fp_extend tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]] +; GCN: buffer_store_dwordx2 [[RESULT]] +define void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %fpext = fpext float %a to double + %fneg = fsub double -0.000000e+00, %fpext + store double %fneg, double addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]] +; GCN: buffer_store_dwordx2 [[RESULT]] +define void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %fneg.a = fsub float -0.000000e+00, %a + %fpext = fpext float %fneg.a to double + %fneg = fsub double -0.000000e+00, %fpext + store double %fneg, double addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]] +; GCN: buffer_store_dwordx2 [[RESULT]] +; GCN: buffer_store_dword [[FNEG_A]] +define void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %fneg.a = fsub float -0.000000e+00, %a + %fpext = fpext float %fneg.a to double + %fneg = fsub double -0.000000e+00, %fpext + store volatile double %fneg, double addrspace(1)* %out.gep + store volatile float %fneg.a, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64: +; GCN: 
{{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]] +; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]] +; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}} +; GCN: buffer_store_dwordx2 v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}} +define void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %fpext = fpext float %a to double + %fneg = fsub double -0.000000e+00, %fpext + store volatile double %fneg, double addrspace(1)* %out.gep + store volatile double %fpext, double addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]] +; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]] +; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0 +; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}} +; GCN: buffer_store_dwordx2 [[MUL]] +define void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %fpext = fpext float %a to double + %fneg = fsub double -0.000000e+00, %fpext + %mul = fmul double %fpext, 4.0 + store volatile double %fneg, double addrspace(1)* %out.gep + store volatile double %mul, double addrspace(1)* %out.gep + ret void +} + +; FIXME: Source modifiers not folded for f16->f32 +; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32: +define void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %a.gep + %fpext = fpext half %a to float + %fneg = fsub float -0.000000e+00, %fpext + store volatile float %fneg, float addrspace(1)* %out.gep + store volatile float %fpext, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32: +define void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %a.gep + %fpext = fpext half %a to float + %fneg = fsub float -0.000000e+00, %fpext + %mul = fmul float %fpext, 4.0 + store volatile float %fneg, float addrspace(1)* %out.gep + store volatile float %mul, float addrspace(1)* %out.gep + 
ret void +} + +; -------------------------------------------------------------------------------- +; fp_round tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]] +; GCN: buffer_store_dword [[RESULT]] +define void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile double, double addrspace(1)* %a.gep + %fpround = fptrunc double %a to float + %fneg = fsub float -0.000000e+00, %fpround + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]] +; GCN: buffer_store_dword [[RESULT]] +define void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile double, double addrspace(1)* %a.gep + %fneg.a = fsub double -0.000000e+00, %a + %fpround = fptrunc double %fneg.a to float + %fneg = fsub float -0.000000e+00, %fpround + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32: +; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} +; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v{{\[}}[[A_LO]]:[[A_HI]]{{\]}} +; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]] +; GCN-DAG: v_mov_b32_e32 v[[NEG_A_LO:[0-9]+]], v[[A_LO]] +; GCN: buffer_store_dword [[RESULT]] +; GCN: buffer_store_dwordx2 v{{\[}}[[NEG_A_LO]]:[[NEG_A_HI]]{{\]}} +define void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile double, double addrspace(1)* %a.gep + %fneg.a = fsub double -0.000000e+00, %a + %fpround = fptrunc double %fneg.a to float + %fneg = fsub float -0.000000e+00, %fpround + store volatile float %fneg, float addrspace(1)* %out.gep + store volatile double %fneg.a, double addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]] +; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}} +; GCN: buffer_store_dword [[RESULT]] +; GCN: buffer_store_dwordx2 [[USE1]] +define void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = 
getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile double, double addrspace(1)* %a.gep + %fneg.a = fsub double -0.000000e+00, %a + %fpround = fptrunc double %fneg.a to float + %fneg = fsub float -0.000000e+00, %fpround + %use1 = fmul double %fneg.a, %c + store volatile float %fneg, float addrspace(1)* %out.gep + store volatile double %use1, double addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]] +; GCN: buffer_store_short [[RESULT]] +define void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %fpround = fptrunc float %a to half + %fneg = fsub half -0.000000e+00, %fpround + store half %fneg, half addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]] +; GCN: buffer_store_short [[RESULT]] +define void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %fneg.a = fsub float -0.000000e+00, %a + %fpround = fptrunc float %fneg.a to half + %fneg = fsub half -0.000000e+00, %fpround + store half %fneg, half addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]] +; GCN: buffer_store_dword [[NEG]] +; GCN: buffer_store_dword [[CVT]] +define void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile double, double addrspace(1)* %a.gep + %fpround = fptrunc double %a to float + %fneg = fsub float -0.000000e+00, %fpround + store volatile float %fneg, float addrspace(1)* %out.gep + store volatile float %fpround, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] +; GCN: buffer_store_short [[RESULT]] +; GCN: buffer_store_dword [[NEG_A]] +define void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext 
+ %a = load volatile float, float addrspace(1)* %a.gep + %fneg.a = fsub float -0.000000e+00, %a + %fpround = fptrunc float %fneg.a to half + %fneg = fsub half -0.000000e+00, %fpround + store volatile half %fneg, half addrspace(1)* %out.gep + store volatile float %fneg.a, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]] +; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s +; GCN: buffer_store_short [[RESULT]] +; GCN: buffer_store_dword [[USE1]] +define void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %fneg.a = fsub float -0.000000e+00, %a + %fpround = fptrunc float %fneg.a to half + %fneg = fsub half -0.000000e+00, %fpround + %use1 = fmul float %fneg.a, %c + store volatile half %fneg, half addrspace(1)* %out.gep + store volatile float %use1, float addrspace(1)* undef + ret void +} + +; -------------------------------------------------------------------------------- +; rcp tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_rcp_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]] +; GCN: buffer_store_dword [[RESULT]] +define void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %rcp = call float @llvm.amdgcn.rcp.f32(float %a) + %fneg = fsub float -0.000000e+00, %rcp + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]] +; GCN: buffer_store_dword [[RESULT]] +define void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %fneg.a = fsub float -0.000000e+00, %a + %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a) + %fneg = fsub float -0.000000e+00, %rcp + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] +; GCN: buffer_store_dword [[RESULT]] +; GCN: buffer_store_dword [[NEG_A]] +define void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + 
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %fneg.a = fsub float -0.000000e+00, %a + %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a) + %fneg = fsub float -0.000000e+00, %rcp + store volatile float %fneg, float addrspace(1)* %out.gep + store volatile float %fneg.a, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]] +; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} +; GCN: buffer_store_dword [[RESULT]] +; GCN: buffer_store_dword [[MUL]] +define void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %fneg.a = fsub float -0.000000e+00, %a + %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a) + %fneg = fsub float -0.000000e+00, %rcp + %use1 = fmul float %fneg.a, %c + store volatile float %fneg, float addrspace(1)* %out.gep + store volatile float %use1, float addrspace(1)* undef + ret void +} + +; -------------------------------------------------------------------------------- +; rcp_legacy tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_rcp_legacy_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]] +; GCN: buffer_store_dword [[RESULT]] +define void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %rcp = call float @llvm.amdgcn.rcp.legacy(float %a) + %fneg = fsub float -0.000000e+00, %rcp + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; fmul_legacy tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]] +; GCN-NEXT: buffer_store_dword [[RESULT]] +define void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) + %fneg = fsub float -0.000000e+00, %mul + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: 
{{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]] +; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] +; GCN: buffer_store_dword [[ADD]] +define void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) + %fneg = fsub float -0.000000e+00, %mul + store volatile float %fneg, float addrspace(1)* %out + store volatile float %mul, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]] +; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] +; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] +; GCN: buffer_store_dword [[MUL]] +define void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) + %fneg = fsub float -0.000000e+00, %mul + %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0) + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b) + %fneg = fsub float -0.000000e+00, %mul + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; 
GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.b = fsub float -0.000000e+00, %b + %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b) + %fneg = fsub float -0.000000e+00, %mul + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %fneg.b = fsub float -0.000000e+00, %b + %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b) + %fneg = fsub float -0.000000e+00, %mul + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] +; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]] +; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] +; GCN: buffer_store_dword [[NEG_A]] +define void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b) + %fneg = fsub float -0.000000e+00, %mul + store volatile float %fneg, float addrspace(1)* %out + store volatile float %fneg.a, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_mul_legacy_f32_e32 
[[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} +; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] +; GCN: buffer_store_dword [[MUL]] +define void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b) + %fneg = fsub float -0.000000e+00, %mul + %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c) + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + +; -------------------------------------------------------------------------------- +; sin tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_sin_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 +; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[K]], -[[A]] +; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]] +; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]] +; GCN: buffer_store_dword [[RESULT]] +define void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %sin = call float @llvm.sin.f32(float %a) + %fneg = fsub float -0.000000e+00, %sin + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]] +; GCN: buffer_store_dword [[RESULT]] +define void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %sin = call float @llvm.amdgcn.sin.f32(float %a) + %fneg = fsub float -0.000000e+00, %sin + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare float @llvm.fma.f32(float, float, float) #1 +declare float @llvm.fmuladd.f32(float, float, float) #1 +declare float @llvm.sin.f32(float) #1 + +declare float @llvm.amdgcn.sin.f32(float) #1 +declare float @llvm.amdgcn.rcp.f32(float) #1 +declare float @llvm.amdgcn.rcp.legacy(float) #1 +declare float @llvm.amdgcn.fmul.legacy(float, float) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fp16_to_fp.ll b/test/CodeGen/AMDGPU/fp16_to_fp.ll deleted file mode 100644 index 5a79ca82bc29..000000000000 --- 
a/test/CodeGen/AMDGPU/fp16_to_fp.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone -declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone - -; SI-LABEL: {{^}}test_convert_fp16_to_fp32: -; SI: buffer_load_ushort [[VAL:v[0-9]+]] -; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]] -define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { - %val = load i16, i16 addrspace(1)* %in, align 2 - %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - - -; SI-LABEL: {{^}}test_convert_fp16_to_fp64: -; SI: buffer_load_ushort [[VAL:v[0-9]+]] -; SI: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]] -; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { - %val = load i16, i16 addrspace(1)* %in, align 2 - %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone - store double %cvt, double addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/test/CodeGen/AMDGPU/fp16_to_fp32.ll new file mode 100644 index 000000000000..35e9541692db --- /dev/null +++ b/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -0,0 +1,22 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s + +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone + +; FUNC-LABEL: {{^}}test_convert_fp16_to_fp32: +; GCN: buffer_load_ushort [[VAL:v[0-9]+]] +; GCN: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[RESULT]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[RES:T[0-9]+\.[XYZW]]] +; CM: MEM_RAT_CACHELESS STORE_DWORD [[RES:T[0-9]+\.[XYZW]]] +; EGCM: VTX_READ_16 [[VAL:T[0-9]+\.[XYZW]]] +; EGCM: FLT16_TO_FLT32{{[ *]*}}[[RES]], [[VAL]] +define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16, i16 addrspace(1)* %in, align 2 + %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/test/CodeGen/AMDGPU/fp16_to_fp64.ll new file mode 100644 index 000000000000..8b05d7b88a10 --- /dev/null +++ b/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s + +declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone + +; FUNC-LABEL: {{^}}test_convert_fp16_to_fp64: +; GCN: buffer_load_ushort [[VAL:v[0-9]+]] +; GCN: 
v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]] +; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] +; GCN: buffer_store_dwordx2 [[RESULT]] +define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16, i16 addrspace(1)* %in, align 2 + %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone + store double %cvt, double addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll index 67925ebd82b6..346ad822f293 100644 --- a/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -1,12 +1,17 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone -; SI-LABEL: {{^}}test_convert_fp32_to_fp16: -; SI: buffer_load_dword [[VAL:v[0-9]+]] -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_short [[RESULT]] +; FUNC-LABEL: {{^}}test_convert_fp32_to_fp16: +; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; GCN: buffer_store_short [[RESULT]] + +; EG: MEM_RAT MSKOR +; EG: VTX_READ_32 +; EG: FLT32_TO_FLT16 define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll index 7351665f06e4..2c538b16e743 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -207,11 +207,15 @@ define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> ; GCN: buffer_load_ushort v{{[0-9]+}}, off ; GCN: buffer_load_ushort v{{[0-9]+}}, off +; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}} +; GCN-DAG: s_and_b32 [[MASK_IDX:s[0-9]+]], s{{[0-9]+}}, 3{{$}} +; GCN-DAG: v_or_b32_e32 [[IDX:v[0-9]+]], [[MASK_IDX]], [[BASE_FI]]{{$}} + ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2 -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} -; GCN: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN: buffer_store_short v{{[0-9]+}}, [[IDX]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} ; GCN: s_waitcnt diff --git a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll index d49fa2bf48a7..2ef045dbb8eb 100644 --- a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll +++ b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll @@ -7,11 +7,14 @@ ; ; CHECK-LABEL: {{^}}main: +; CHECK-DAG: v_mov_b32_e32 
[[K:v[0-9]+]], 0x200 +; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; CHECK-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 -; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]] +; CHECK-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] ; TODO: add 0? -; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]] +; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]] +; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]] ; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen ; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll index 0e6281940c24..d141281f36b8 100644 --- a/test/CodeGen/AMDGPU/mad-combine.ll +++ b/test/CodeGen/AMDGPU/mad-combine.ll @@ -273,12 +273,12 @@ define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, fl ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] ; SI: buffer_store_dword [[RESULT]] define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { @@ -306,15 +306,15 @@ define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float a ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]] ; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] ; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll new file mode 100644 index 000000000000..559d464f36a5 --- /dev/null +++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll @@ -0,0 +1,46 @@ +; RUN: llc -march=amdgcn 
-mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s + +; -------------------------------------------------------------------------------- +; Don't fold if fneg can fold into the source +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_legacy_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] + +; GCN: v_rcp_legacy_f32_e32 [[RCP:v[0-9]+]], [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc +; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] +; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %rcp = call float @llvm.amdgcn.rcp.legacy(float %x) + %fneg = fsub float -0.0, %rcp + %select = select i1 %cmp, float %fneg, float 2.0 + store volatile float %select, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}select_fneg_posk_src_mul_legacy_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] + +; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc +; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] +; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +define void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 { + %x = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %mul = call float @llvm.amdgcn.fmul.legacy(float %x, float 4.0) + %fneg = fsub float -0.0, %mul + %select = select i1 %cmp, float %fneg, float 2.0 + store volatile float %select, float addrspace(1)* undef + ret void +} + +declare float @llvm.amdgcn.rcp.legacy(float) #1 +declare float @llvm.amdgcn.fmul.legacy(float, float) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll new file mode 100644 index 000000000000..d9d311cd032b --- /dev/null +++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -0,0 +1,840 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}add_select_fabs_fabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc +; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]] +define void @add_select_fabs_fabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %select = select i1 %cmp, float %fabs.x, float %fabs.y + %add = fadd float %select, %z + store float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_multi_use_lhs_fabs_fabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[W:v[0-9]+]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc +; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]] +; GCN-DAG: 
v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[W]] +define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %w = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %select = select i1 %cmp, float %fabs.x, float %fabs.y + %add0 = fadd float %select, %z + %add1 = fadd float %fabs.x, %w + store volatile float %add0, float addrspace(1)* undef + store volatile float %add1, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_multi_store_use_lhs_fabs_fabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN-DAG: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc +; GCN-DAG: v_add_f32_e64 [[ADD:v[0-9]+]], |[[SELECT]]|, [[Z]] +; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] + +; GCN: buffer_store_dword [[ADD]] +; GCN: buffer_store_dword [[X_ABS]] +define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %select = select i1 %cmp, float %fabs.x, float %fabs.y + %add0 = fadd float %select, %z + store volatile float %add0, float addrspace(1)* undef + store volatile float %fabs.x, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_multi_use_rhs_fabs_fabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[W:v[0-9]+]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc +; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]] +; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[Y]]|, [[W]] +define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %w = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %select = select i1 %cmp, float %fabs.x, float %fabs.y + %add0 = fadd float %select, %z + %add1 = fadd float %fabs.y, %w + store volatile float %add0, float addrspace(1)* undef + store volatile float %add1, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_fabs_var_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +define void @add_select_fabs_var_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %select = select i1 %cmp, float %fabs.x, float %y + %add = fadd float %select, %z + store volatile float %add, float addrspace(1)* 
undef + ret void +} + +; GCN-LABEL: {{^}}add_select_fabs_negk_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] +define void @add_select_fabs_negk_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs = call float @llvm.fabs.f32(float %x) + %select = select i1 %cmp, float %fabs, float -1.0 + %add = fadd float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; FIXME: fabs should fold away +; GCN-LABEL: {{^}}add_select_fabs_negk_negk_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] + +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s +; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]] +define void @add_select_fabs_negk_negk_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float -2.0, float -1.0 + %fabs = call float @llvm.fabs.f32(float %select) + %add = fadd float %fabs, %x + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_posk_posk_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] + +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] +define void @add_select_posk_posk_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float 2.0, float 1.0 + %add = fadd float %select, %x + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_negk_fabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]] +; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] +define void @add_select_negk_fabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs = call float @llvm.fabs.f32(float %x) + %select = select i1 %cmp, float -1.0, float %fabs + %add = fadd float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_negliteralk_fabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xc4800000 + +; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]] +; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] +define void @add_select_negliteralk_fabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs = call float @llvm.fabs.f32(float %x) + %select = select i1 %cmp, float -1024.0, float %fabs + %add = fadd float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_fabs_posk_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc +; GCN: 
v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]] +define void @add_select_fabs_posk_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + + %cmp = icmp eq i32 %c, 0 + %fabs = call float @llvm.fabs.f32(float %x) + %select = select i1 %cmp, float %fabs, float 1.0 + %add = fadd float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_posk_fabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc +; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]] +define void @add_select_posk_fabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs = call float @llvm.fabs.f32(float %x) + %select = select i1 %cmp, float 1.0, float %fabs + %add = fadd float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_fneg_fneg_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc +; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] +define void @add_select_fneg_fneg_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.0, %x + %fneg.y = fsub float -0.0, %y + %select = select i1 %cmp, float %fneg.x, float %fneg.y + %add = fadd float %select, %z + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_multi_use_lhs_fneg_fneg_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[W:v[0-9]+]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc +; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] +; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]] +define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %w = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.0, %x + %fneg.y = fsub float -0.0, %y + %select = select i1 %cmp, float %fneg.x, float %fneg.y + %add0 = fadd float %select, %z + %add1 = fadd float %fneg.x, %w + store volatile float %add0, float addrspace(1)* undef + store volatile float %add1, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_multi_store_use_lhs_fneg_fneg_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN-DAG: v_xor_b32_e32 [[NEG_X:v[0-9]+]], 0x80000000, [[X]] +; GCN-DAG: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc +; GCN-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[SELECT]], [[Z]] + +; GCN: buffer_store_dword [[ADD]] +; GCN: buffer_store_dword [[NEG_X]] +define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, 
float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.0, %x + %fneg.y = fsub float -0.0, %y + %select = select i1 %cmp, float %fneg.x, float %fneg.y + %add0 = fadd float %select, %z + store volatile float %add0, float addrspace(1)* undef + store volatile float %fneg.x, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_multi_use_rhs_fneg_fneg_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] +; GCN: buffer_load_dword [[W:v[0-9]+]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc +; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] +; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]] +define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %w = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.0, %x + %fneg.y = fsub float -0.0, %y + %select = select i1 %cmp, float %fneg.x, float %fneg.y + %add0 = fadd float %select, %z + %add1 = fadd float %fneg.y, %w + store volatile float %add0, float addrspace(1)* undef + store volatile float %add1, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_fneg_var_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +define void @add_select_fneg_var_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.0, %x + %select = select i1 %cmp, float %fneg.x, float %y + %add = fadd float %select, %z + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_fneg_negk_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc +; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] +define void @add_select_fneg_negk_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.0, %x + %select = select i1 %cmp, float %fneg.x, float -1.0 + %add = fadd float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_fneg_inv2pi_f32: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc +; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] +define void @add_select_fneg_inv2pi_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.0, %x + %select = select i1 %cmp, float %fneg.x, float 0x3FC45F3060000000 + %add = fadd float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_fneg_neginv2pi_f32: +; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 +; GCN: 
buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc +; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc + +; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] +define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.0, %x + %select = select i1 %cmp, float %fneg.x, float 0xBFC45F3060000000 + %add = fadd float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_negk_negk_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] + +; GCN: v_cmp_eq_u32_e64 +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] +define void @add_select_negk_negk_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float -2.0, float -1.0 + %add = fadd float %select, %x + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_negliteralk_negliteralk_f32: +; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0xc5000000 +; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc5800000 +; GCN-DAG: buffer_load_dword [[X:v[0-9]+]] + +; GCN: v_cmp_eq_u32_e64 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] +define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float -2048.0, float -4096.0 + %add = fadd float %select, %x + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_fneg_negk_negk_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] + +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s +; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]] +define void @add_select_fneg_negk_negk_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float -2.0, float -1.0 + %fneg.x = fsub float -0.0, %select + %add = fadd float %fneg.x, %x + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_negk_fneg_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc +; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] +define void @add_select_negk_fneg_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.0, %x + %select = select i1 %cmp, float -1.0, float %fneg.x + %add = fadd float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_fneg_posk_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc +; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] +define void @add_select_fneg_posk_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.0, %x + %select = select i1 %cmp, float %fneg.x, float 1.0 + 
%add = fadd float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_posk_fneg_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc +; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] +define void @add_select_posk_fneg_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.0, %x + %select = select i1 %cmp, float 1.0, float %fneg.x + %add = fadd float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_negfabs_fabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN-DAG: v_or_b32_e32 [[X_NEG_ABS:v[0-9]+]], 0x80000000, [[X]] +; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +define void @add_select_negfabs_fabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x + %fabs.y = call float @llvm.fabs.f32(float %y) + %select = select i1 %cmp, float %fneg.fabs.x, float %fabs.y + %add = fadd float %select, %z + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_fabs_negfabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN-DAG: v_or_b32_e32 [[Y_NEG_ABS:v[0-9]+]], 0x80000000, [[Y]] +; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +define void @add_select_fabs_negfabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %fneg.fabs.y = fsub float -0.000000e+00, %fabs.y + %select = select i1 %cmp, float %fabs.x, float %fneg.fabs.y + %add = fadd float %select, %z + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_neg_fabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN-DAG: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]] +; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +define void @add_select_neg_fabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.000000e+00, %x + %fabs.y = call float @llvm.fabs.f32(float %y) + %select = select i1 %cmp, float %fneg.x, float %fabs.y + %add = fadd 
float %select, %z + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_fabs_neg_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] +; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +define void @add_select_fabs_neg_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %fneg.y = fsub float -0.000000e+00, %y + %select = select i1 %cmp, float %fabs.x, float %fneg.y + %add = fadd float %select, %z + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_neg_negfabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc +; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] +define void @add_select_neg_negfabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fneg.x = fsub float -0.000000e+00, %x + %fabs.y = call float @llvm.fabs.f32(float %y) + %fneg.fabs.y = fsub float -0.000000e+00, %fabs.y + %select = select i1 %cmp, float %fneg.x, float %fneg.fabs.y + %add = fadd float %select, %z + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}add_select_negfabs_neg_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc +; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] +define void @add_select_negfabs_neg_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x + %fneg.y = fsub float -0.000000e+00, %y + %select = select i1 %cmp, float %fneg.y, float %fneg.fabs.x + %add = fadd float %select, %z + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}mul_select_negfabs_posk_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN-DAG: v_cmp_eq_u32_e64 vcc, +; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc +; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] +define void @mul_select_negfabs_posk_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x + %select = select i1 %cmp, float %fneg.fabs.x, float 4.0 + %add = fmul float %select, %y + store volatile float 
%add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}mul_select_posk_negfabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 +; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc +; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] +define void @mul_select_posk_negfabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x + %select = select i1 %cmp, float 4.0, float %fneg.fabs.x + %add = fmul float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}mul_select_negfabs_negk_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc +; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] +define void @mul_select_negfabs_negk_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x + %select = select i1 %cmp, float %fneg.fabs.x, float -4.0 + %add = fmul float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}mul_select_negk_negfabs_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN: v_cmp_ne_u32_e64 vcc +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc +; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] +define void @mul_select_negk_negfabs_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fabs.x = call float @llvm.fabs.f32(float %x) + %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x + %select = select i1 %cmp, float -4.0, float %fneg.fabs.x + %add = fmul float %select, %y + store volatile float %add, float addrspace(1)* undef + ret void +} + +; -------------------------------------------------------------------------------- +; Don't fold if fneg can fold into the source +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}select_fneg_posk_src_add_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Y:v[0-9]+]] + +; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], -4.0, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] +define void @select_fneg_posk_src_add_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %add = fadd float %x, 4.0 + %fneg = fsub float -0.0, %add + %select = select i1 %cmp, float %fneg, float 2.0 + store volatile float %select, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}select_fneg_posk_src_sub_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] + +; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], 4.0, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] +define void @select_fneg_posk_src_sub_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 
0 + %add = fsub float %x, 4.0 + %fneg = fsub float -0.0, %add + %select = select i1 %cmp, float %fneg, float 2.0 + store volatile float %select, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}select_fneg_posk_src_mul_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] + +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] +define void @select_fneg_posk_src_mul_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %mul = fmul float %x, 4.0 + %fneg = fsub float -0.0, %mul + %select = select i1 %cmp, float %fneg, float 2.0 + store volatile float %select, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}select_fneg_posk_src_fma_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[X]], -4.0, -[[Z]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[FMA]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] +define void @select_fneg_posk_src_fma_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fma = call float @llvm.fma.f32(float %x, float 4.0, float %z) + %fneg = fsub float -0.0, %fma + %select = select i1 %cmp, float %fneg, float 2.0 + store volatile float %select, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}select_fneg_posk_src_fmad_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] +; GCN: buffer_load_dword [[Z:v[0-9]+]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[X]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] +define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %z = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %fmad = call float @llvm.fmuladd.f32(float %x, float 4.0, float %z) + %fneg = fsub float -0.0, %fmad + %select = select i1 %cmp, float %fneg, float 2.0 + store volatile float %select, float addrspace(1)* undef + ret void +} + +; FIXME: This one should fold to rcp +; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_f32: +; GCN: buffer_load_dword [[X:v[0-9]+]] + +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc +; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] +; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +define void @select_fneg_posk_src_rcp_f32(i32 %c) #0 { + %x = load volatile float, float addrspace(1)* undef + %y = load volatile float, float addrspace(1)* undef + %cmp = icmp eq i32 %c, 0 + %rcp = call float @llvm.amdgcn.rcp.f32(float %x) + %fneg = fsub float -0.0, %rcp + %select = select i1 %cmp, float %fneg, float 2.0 + store volatile float %select, float addrspace(1)* undef + ret void +} + +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.fma.f32(float, float, float) #1 +declare float @llvm.fmuladd.f32(float, float, float) #1 +declare float @llvm.amdgcn.rcp.f32(float) #1 +declare float @llvm.amdgcn.rcp.legacy(float) #1 +declare float @llvm.amdgcn.fmul.legacy(float, float) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/select-opt.ll b/test/CodeGen/AMDGPU/select-opt.ll new file mode 100644 index 000000000000..ad358d33c405 --- /dev/null +++ b/test/CodeGen/AMDGPU/select-opt.ll @@ -0,0 +1,161 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Make 
sure to test with f32 and i32 compares. If we have to use float +; compares, we always have multiple condition registers. If we can do +; scalar compares, we don't want to use multiple condition registers. + +; GCN-LABEL: {{^}}opt_select_i32_and_cmp_i32: +; GCN-DAG: v_cmp_ne_u32_e32 vcc, +; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] +; GCN: s_and_b64 vcc, vcc, [[CMP1]] +; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc +; GCN-NOT: [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] +define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { + %icmp0 = icmp ne i32 %a, %b + %icmp1 = icmp ne i32 %a, %c + %and = and i1 %icmp0, %icmp1 + %select = select i1 %and, i32 %x, i32 %y + store i32 %select, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}opt_select_i32_and_cmp_f32: +; GCN-DAG: v_cmp_lg_f32_e32 vcc +; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] +; GCN: s_and_b64 vcc, vcc, [[CMP1]] +; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc +; GCN-NOT: [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] +define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { + %fcmp0 = fcmp one float %a, %b + %fcmp1 = fcmp one float %a, %c + %and = and i1 %fcmp0, %fcmp1 + %select = select i1 %and, i32 %x, i32 %y + store i32 %select, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}opt_select_i64_and_cmp_i32: +; GCN-DAG: v_cmp_ne_u32_e32 vcc, +; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] +; GCN: s_and_b64 vcc, vcc, [[CMP1]] +; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc +; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} +define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { + %icmp0 = icmp ne i32 %a, %b + %icmp1 = icmp ne i32 %a, %c + %and = and i1 %icmp0, %icmp1 + %select = select i1 %and, i64 %x, i64 %y + store i64 %select, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}opt_select_i64_and_cmp_f32: +; GCN-DAG: v_cmp_lg_f32_e32 vcc, +; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] +; GCN: s_and_b64 vcc, vcc, [[CMP1]] +; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc +; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} +define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { + %fcmp0 = fcmp one float %a, %b + %fcmp1 = fcmp one float %a, %c + %and = and i1 %fcmp0, %fcmp1 + %select = select i1 %and, i64 %x, i64 %y + store i64 %select, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}opt_select_i32_or_cmp_i32: +; GCN-DAG: v_cmp_ne_u32_e32 vcc, +; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] +; GCN: s_or_b64 vcc, vcc, [[CMP1]] +; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc +; GCN-NOT: [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] +; GCN: s_endpgm +define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { + %icmp0 = icmp ne i32 %a, %b + %icmp1 = icmp ne i32 %a, %c + %or = or i1 %icmp0, %icmp1 + %select = select i1 %or, i32 %x, i32 %y + store i32 %select, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}opt_select_i32_or_cmp_f32: +; GCN-DAG: 
v_cmp_lg_f32_e32 vcc +; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] +; GCN: s_or_b64 vcc, vcc, [[CMP1]] +; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc +; GCN-NOT: [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] +define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { + %fcmp0 = fcmp one float %a, %b + %fcmp1 = fcmp one float %a, %c + %or = or i1 %fcmp0, %fcmp1 + %select = select i1 %or, i32 %x, i32 %y + store i32 %select, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}opt_select_i64_or_cmp_i32: +; GCN-DAG: v_cmp_ne_u32_e32 vcc, +; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] +; GCN: s_or_b64 vcc, vcc, [[CMP1]] +; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc +; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} +define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { + %icmp0 = icmp ne i32 %a, %b + %icmp1 = icmp ne i32 %a, %c + %or = or i1 %icmp0, %icmp1 + %select = select i1 %or, i64 %x, i64 %y + store i64 %select, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}opt_select_i64_or_cmp_f32: +; GCN-DAG: v_cmp_lg_f32_e32 vcc, +; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] +; GCN: s_or_b64 vcc, vcc, [[CMP1]] +; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc +; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} +define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { + %fcmp0 = fcmp one float %a, %b + %fcmp1 = fcmp one float %a, %c + %or = or i1 %fcmp0, %fcmp1 + %select = select i1 %or, i64 %x, i64 %y + store i64 %select, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}regression: +; GCN: v_cmp_neq_f32_e64 vcc +; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0 +; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} + +define void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 { +entry: + %cmp0 = fcmp oeq float %c0, 1.0 + br i1 %cmp0, label %if0, label %endif + +if0: + %cmp1 = fcmp oeq float %c1, 0.0 + br i1 %cmp1, label %if1, label %endif + +if1: + %cmp2 = xor i1 %cmp1, true + br label %endif + +endif: + %tmp0 = phi i1 [ true, %entry ], [ %cmp2, %if1 ], [ false, %if0 ] + %tmp2 = select i1 %tmp0, float 4.0, float 0.0 + store float %tmp2, float addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll index f9216d311471..84f9b7bb8064 100644 --- a/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -2,6 +2,8 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; FIXME: i16 promotion pass ruins the scalar cases when legal. 
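A reading aid for the checks below, not part of the patch: the `shl`/`ashr` pairs in these tests implement sign-extension-in-register, and the third operand of `s_bfe_i32` packs the bitfield as `(width << 16) | offset`, which is why 1-, 2-, and 15-bit extracts at offset 0 show up as `0x10000`, `0x20000`, and `0xf0000`. A minimal sketch of the pattern (hypothetical function name):

define i16 @sext_in_reg_i1_sketch(i16 %in) {
  ; Shift the interesting bit into the sign position, then arithmetic-
  ; shift it back down so it is replicated across the whole value.
  %shl = shl i16 %in, 15
  %sext = ashr i16 %shl, 15   ; selects to s_bfe_i32 ..., 0x10000 on SI
  ret i16 %sext
}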
+ ; FUNC-LABEL: {{^}}sext_in_reg_i1_i32: ; GCN: s_load_dword [[ARG:s[0-9]+]], ; GCN: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000 @@ -659,6 +661,137 @@ define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrs ret void } +; FUNC-LABEL: {{^}}s_sext_in_reg_i1_i16: +; GCN: s_load_dword [[VAL:s[0-9]+]] + +; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000 +; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] +; SI: buffer_store_short [[VBFE]] + +; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 +; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 +define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { + %ld = load i32, i32 addrspace(2)* %ptr + %in = trunc i32 %ld to i16 + %shl = shl i16 %in, 15 + %sext = ashr i16 %shl, 15 + store i16 %sext, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_sext_in_reg_i2_i16: +; GCN: s_load_dword [[VAL:s[0-9]+]] + +; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x20000 +; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] +; SI: buffer_store_short [[VBFE]] + +; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 +; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 +define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { + %ld = load i32, i32 addrspace(2)* %ptr + %in = trunc i32 %ld to i16 + %shl = shl i16 %in, 14 + %sext = ashr i16 %shl, 14 + store i16 %sext, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_sext_in_reg_i1_i16: +; GCN: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]] +; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[VAL]], 0, 1{{$}} + +; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]] +define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep = getelementptr i16, i16 addrspace(1)* %ptr, i32 %tid + %out.gep = getelementptr i16, i16 addrspace(3)* %out, i32 %tid + + %in = load i16, i16 addrspace(1)* %gep + %shl = shl i16 %in, 15 + %sext = ashr i16 %shl, 15 + store i16 %sext, i16 addrspace(3)* %out.gep + ret void +} + +; FUNC-LABEL: {{^}}v_sext_in_reg_i1_i16_nonload: +; GCN: {{buffer|flat}}_load_ushort [[VAL0:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[VAL1:v[0-9]+]] + +; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]] +; VI: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]] + +; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}} +; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]] +define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid + %out.gep = getelementptr i16, i16 addrspace(3)* %out, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %a.gep, align 2 + %b = load volatile i16, i16 addrspace(1)* %b.gep, align 2 + + %c = shl i16 %a, %b + %shl = shl i16 %c, 15 + %ashr = ashr i16 %shl, 15 + + store i16 %ashr, i16 addrspace(3)* %out.gep, align 2 + ret void +} + +; FUNC-LABEL: {{^}}s_sext_in_reg_i2_i16_arg: +; GCN: s_load_dword [[VAL:s[0-9]+]] + +; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x20000 +; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] +; SI: buffer_store_short [[VBFE]] + +; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}} +; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}} +define void @s_sext_in_reg_i2_i16_arg(i16 
addrspace(1)* %out, i16 %in) #0 {
+  %shl = shl i16 %in, 14
+  %sext = ashr i16 %shl, 14
+  store i16 %sext, i16 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_sext_in_reg_i8_i16_arg:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_sext_i32_i8 [[SSEXT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VSEXT:v[0-9]+]], [[SSEXT]]
+; SI: buffer_store_short [[VSEXT]]
+
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+  %shl = shl i16 %in, 8
+  %sext = ashr i16 %shl, 8
+  store i16 %sext, i16 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_sext_in_reg_i15_i16_arg:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0xf0000
+; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; SI: buffer_store_short [[VBFE]]
+
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
+; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
+define void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+  %shl = shl i16 %in, 1
+  %sext = ashr i16 %shl, 1
+  store i16 %sext, i16 addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.r600.read.tidig.x() #1
 
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
new file mode 100644
index 000000000000..1988a14b5845
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
@@ -0,0 +1,597 @@
+# RUN: llc -verify-machineinstrs -march=amdgcn -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s
+# Check that add with carry out isn't incorrectly reduced to e32 when
+# the carry out is a virtual register.
+ +# TODO: We should run this test until the end of codegen to make sure +# that the post-RA run does manage to shrink it, but right now the +# resume crashes + +--- | + define void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.ptr + %b = load volatile i32, i32 addrspace(1)* %b.ptr + %result = add i32 %a, %b + store volatile i32 %result, i32 addrspace(1)* %out.gep + ret void + } + + define void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.ptr + %b = load volatile i32, i32 addrspace(1)* %b.ptr + %result = sub i32 %a, %b + store volatile i32 %result, i32 addrspace(1)* %out.gep + ret void + } + + define void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.ptr + %b = load volatile i32, i32 addrspace(1)* %b.ptr + %result = sub i32 %a, %b + store volatile i32 %result, i32 addrspace(1)* %out.gep + ret void + } + + define void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.ptr + %b = load volatile i32, i32 addrspace(1)* %b.ptr + %result = add i32 %a, %b + store volatile i32 %result, i32 addrspace(1)* %out.gep + ret void + } + + define void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.ptr + %b = load volatile i32, i32 addrspace(1)* %b.ptr + %result = add i32 %a, %b + store volatile i32 %result, i32 addrspace(1)* %out.gep + ret void + } + + define void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext + %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.ptr + %b = load volatile i32, i32 addrspace(1)* %b.ptr + %result = add i32 %a, %b + store 
volatile i32 %result, i32 addrspace(1)* %out.gep + ret void + } + + declare i32 @llvm.amdgcn.workitem.id.x() #1 + + attributes #0 = { nounwind } + attributes #1 = { nounwind readnone } + +... +--- +# GCN-LABEL: name: shrink_add_vop3{{$}} +# GCN: %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec +# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec +name: shrink_add_vop3 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32 } + - { id: 23, class: sreg_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vreg_64 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %27 = REG_SEQUENCE %3, 1, %26, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %28 = V_LSHL_B64 killed %27, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec + %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + S_ENDPGM + +... 
+--- +# GCN-LABEL: name: shrink_sub_vop3{{$}} +# GCN: %29, %9 = V_SUB_I32_e64 %19, %17, implicit %exec +# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec + +name: shrink_sub_vop3 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32 } + - { id: 23, class: sreg_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vreg_64 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %27 = REG_SEQUENCE %3, 1, %26, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %28 = V_LSHL_B64 killed %27, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %29, %9 = V_SUB_I32_e64 %19, %17, implicit %exec + %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + S_ENDPGM + +... 
+--- +# GCN-LABEL: name: shrink_subrev_vop3{{$}} +# GCN: %29, %9 = V_SUBREV_I32_e64 %19, %17, implicit %exec +# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec + +name: shrink_subrev_vop3 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32 } + - { id: 23, class: sreg_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vreg_64 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %27 = REG_SEQUENCE %3, 1, %26, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %28 = V_LSHL_B64 killed %27, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %29, %9 = V_SUBREV_I32_e64 %19, %17, implicit %exec + %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec + BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + S_ENDPGM + +... 
+--- +# GCN-LABEL: name: check_addc_src2_vop3{{$}} +# GCN: %29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec +# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec +name: check_addc_src2_vop3 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32 } + - { id: 23, class: sreg_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vreg_64 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %27 = REG_SEQUENCE %3, 1, %26, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %28 = V_LSHL_B64 killed %27, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %9 = S_MOV_B64 0 + %29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec + %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + S_ENDPGM + +... 
+---
+# GCN-LABEL: name: shrink_addc_vop3{{$}}
+# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+
+name: shrink_addc_vop3
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_32 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32_xm0 }
+  - { id: 9, class: sreg_64 }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sgpr_64 }
+  - { id: 13, class: sgpr_128 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_64 }
+  - { id: 16, class: sgpr_128 }
+  - { id: 17, class: vgpr_32 }
+  - { id: 18, class: vreg_64 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vreg_64 }
+  - { id: 21, class: sreg_32_xm0 }
+  - { id: 22, class: sreg_32 }
+  - { id: 23, class: sreg_32 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vreg_64 }
+  - { id: 26, class: vgpr_32 }
+  - { id: 27, class: vreg_64 }
+  - { id: 28, class: vreg_64 }
+  - { id: 29, class: vgpr_32 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    %27 = REG_SEQUENCE %3, 1, %26, 2
+    %10 = S_MOV_B32 61440
+    %11 = S_MOV_B32 0
+    %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+    %13 = REG_SEQUENCE killed %5, 17, %12, 18
+    %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+    %16 = REG_SEQUENCE killed %4, 17, %12, 18
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+    %vcc = S_MOV_B64 0
+    %29, %vcc = V_ADDC_U32_e64 %19, %17, %vcc, implicit %exec
+    %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+    S_ENDPGM
+
+...
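Taken together, check_addc_src2_vop3 and the test above pin down the constraint being exercised: the e32 encoding carries no explicit carry operands (carry-in and carry-out are implicitly %vcc), so si-shrink-instructions can only drop to e32 once every carry operand is literally %vcc. Side by side (both lines appear in the checks above; this is just a restatement):

# Stays e64 -- the carry-in %9 is a virtual SGPR pair, and e32 has no slot for it:
#   %29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec
# Shrinks to e32 -- carry-in and carry-out are both already %vcc, the implicit operands:
#   %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit %vcc, implicit %exec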
+ +--- +# GCN-LABEL: name: shrink_addc_undef_vcc{{$}} +# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit undef %vcc, implicit %exec +# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec +name: shrink_addc_undef_vcc +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64_xexec } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_64 } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sgpr_64 } + - { id: 13, class: sgpr_128 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_64 } + - { id: 16, class: sgpr_128 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vreg_64 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32 } + - { id: 23, class: sreg_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vreg_64 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vreg_64 } + - { id: 28, class: vreg_64 } + - { id: 29, class: vgpr_32 } +liveins: + - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } + - { reg: '%vgpr0', virtual-reg: '%3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %sgpr0_sgpr1, %vgpr0 + + %3 = COPY %vgpr0 + %0 = COPY %sgpr0_sgpr1 + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec + %27 = REG_SEQUENCE %3, 1, %26, 2 + %10 = S_MOV_B32 61440 + %11 = S_MOV_B32 0 + %12 = REG_SEQUENCE killed %11, 1, killed %10, 2 + %13 = REG_SEQUENCE killed %5, 17, %12, 18 + %28 = V_LSHL_B64 killed %27, 2, implicit %exec + %16 = REG_SEQUENCE killed %4, 17, %12, 18 + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %29, %vcc = V_ADDC_U32_e64 %19, %17, undef %vcc, implicit %exec + %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + S_ENDPGM + +... diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll index 027c63817903..16aed5928b0a 100644 --- a/test/CodeGen/AMDGPU/v_mac.ll +++ b/test/CodeGen/AMDGPU/v_mac.ll @@ -212,5 +212,71 @@ entry: ret void } +; Without special casing the inline constant check for v_mac_f32's +; src2, this fails to fold the 1.0 into a mad. 
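Concretely: v_mac_f32 accumulates into its src2, which is tied to the destination and therefore must be a register, so an inline constant like 1.0 can never occupy that slot; folding `1.0 - 4.0*x` into a single instruction requires selecting v_mad_f32 instead, which does take 1.0 as a plain inline operand. The IR shape being matched, as a sketch with a hypothetical name (it mirrors the %tmp2..%tmp4 chain in the test below):

define float @mad_from_fsub_sketch(float %x) {
  %mul = fmul float %x, 4.0
  ; 1.0 - 4.0*%x == mad(%x, -4.0, 1.0); the checks below expect
  ; v_mad_f32 ..., -4.0, 1.0 rather than a v_mac_f32.
  %sub = fsub float 1.0, %mul
  ret float %sub
}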
+ +; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] +; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 +define void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep.a = getelementptr inbounds float, float addrspace(1)* %a, i64 %tid.ext + %gep.b = getelementptr inbounds float, float addrspace(1)* %b, i64 %tid.ext + %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %tmp = load volatile float, float addrspace(1)* %gep.a + %tmp1 = load volatile float, float addrspace(1)* %gep.b + %tmp2 = fadd float %tmp, %tmp + %tmp3 = fmul float %tmp2, 4.0 + %tmp4 = fsub float 1.0, %tmp3 + %tmp5 = fadd float %tmp4, %tmp1 + %tmp6 = fadd float %tmp1, %tmp1 + %tmp7 = fmul float %tmp6, %tmp + %tmp8 = fsub float 1.0, %tmp7 + %tmp9 = fmul float %tmp8, 8.0 + %tmp10 = fadd float %tmp5, %tmp9 + store float %tmp10, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f16: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[B:v[0-9]+]] + +; FIXME: How is this not folded? +; SI: v_cvt_f32_f16_e32 v{{[0-9]+}}, 0x3c00 + +; VI: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] +; VI: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 +define void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext + %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext + %gep.out = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext + %tmp = load volatile half, half addrspace(1)* %gep.a + %tmp1 = load volatile half, half addrspace(1)* %gep.b + %tmp2 = fadd half %tmp, %tmp + %tmp3 = fmul half %tmp2, 4.0 + %tmp4 = fsub half 1.0, %tmp3 + %tmp5 = fadd half %tmp4, %tmp1 + %tmp6 = fadd half %tmp1, %tmp1 + %tmp7 = fmul half %tmp6, %tmp + %tmp8 = fsub half 1.0, %tmp7 + %tmp9 = fmul half %tmp8, 8.0 + %tmp10 = fadd half %tmp5, %tmp9 + store half %tmp10, half addrspace(1)* %gep.out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #2 + attributes #0 = { nounwind "unsafe-fp-math"="false" } attributes #1 = { nounwind "unsafe-fp-math"="true" } +attributes #2 = { nounwind readnone } +attributes #3 = { nounwind } diff --git a/test/CodeGen/ARM/fp16-promote.ll b/test/CodeGen/ARM/fp16-promote.ll index ebc5934df022..824123687287 100644 --- a/test/CodeGen/ARM/fp16-promote.ll +++ b/test/CodeGen/ARM/fp16-promote.ll @@ -825,7 +825,7 @@ define void @test_fmuladd(half* %p, half* %q, half* %r) #0 { ; CHECK-ALL: strh ; CHECK-ALL: mov ; CHECK-ALL-DAG: ldrh -; CHECK-ALL-DAG: add +; CHECK-ALL-DAG: orr ; CHECK-ALL: strh ; CHECK-ALL: ldrh ; CHECK-ALL: strh @@ -855,7 +855,7 @@ define void @test_insertelement(half* %p, <4 x half>* %q, i32 %i) #0 { ; CHECK-VFP: orr ; CHECK-VFP: str ; CHECK-VFP: mov -; CHECK-VFP: add +; CHECK-VFP: orr ; CHECK-VFP: ldrh ; CHECK-VFP: strh ; CHECK-VFP: add sp, sp, #8 diff --git a/test/CodeGen/ARM/fpcmp_ueq.ll b/test/CodeGen/ARM/fpcmp_ueq.ll index ba14140cdc44..c1696c9be1b7 100644 --- a/test/CodeGen/ARM/fpcmp_ueq.ll +++ b/test/CodeGen/ARM/fpcmp_ueq.ll @@ -9,7 +9,11 @@ entry: } ; 
CHECK-ARMv4-LABEL: f7: -; CHECK-ARMv4: moveq r6, #1 +; CHECK-ARMv4-DAG: bl ___eqsf2 +; CHECK-ARMv4-DAG: bl ___unordsf2 +; CHECK-ARMv4: cmp r0, #0 +; CHECK-ARMv4: movne r0, #1 +; CHECK-ARMv4: orrs r0, r0, ; CHECK-ARMv4: moveq r0, #42 ; CHECK-ARMv7-LABEL: f7: diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll index 25c4807d9862..b7693c797635 100644 --- a/test/CodeGen/ARM/vdup.ll +++ b/test/CodeGen/ARM/vdup.ll @@ -373,7 +373,8 @@ define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) { ; CHECK: mov r[[FP:[0-9]+]], sp ; CHECK: ldr r[[IDX:[0-9]+]], [r[[FP]], #4] ; CHECK: mov r[[SPCOPY:[0-9]+]], sp -; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[IDX]] +; CHECK: and r[[MASKED_IDX:[0-9]+]], r[[IDX]], #15 +; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[MASKED_IDX]] ; CHECK: vld1.8 {d{{.*}}[]}, [r[[SPCOPY]]] %x = extractelement <16 x i8> %v, i32 %idx %1 = insertelement <8 x i8> undef, i8 %x, i32 0 diff --git a/test/CodeGen/ARM/vpadd.ll b/test/CodeGen/ARM/vpadd.ll index 269223ac9f38..1aa23597cf49 100644 --- a/test/CodeGen/ARM/vpadd.ll +++ b/test/CodeGen/ARM/vpadd.ll @@ -214,14 +214,11 @@ define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind { } ; Combine vuzp+vadd->vpadd. -; FIXME: Implement this optimization -define void @addCombineToVPADD(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp { -; CHECK-LABEL: addCombineToVPADD: +define void @addCombineToVPADD_i8(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADD_i8: ; CHECK: @ BB#0: ; CHECK-NEXT: vld1.64 {d16, d17}, [r0] -; CHECK-NEXT: vorr d18, d17, d17 -; CHECK-NEXT: vuzp.8 d16, d18 -; CHECK-NEXT: vadd.i8 d16, d18, d16 +; CHECK-NEXT: vpadd.i8 d16, d16, d17 ; CHECK-NEXT: vstr d16, [r1] ; CHECK-NEXT: mov pc, lr %tmp = load <16 x i8>, <16 x i8>* %cbcr @@ -233,15 +230,44 @@ define void @addCombineToVPADD(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp { ret void } +; Combine vuzp+vadd->vpadd. +define void @addCombineToVPADD_i16(<8 x i16> *%cbcr, <4 x i16> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADD_i16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vpadd.i16 d16, d16, d17 +; CHECK-NEXT: vstr d16, [r1] +; CHECK-NEXT: mov pc, lr + %tmp = load <8 x i16>, <8 x i16>* %cbcr + %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %add = add <4 x i16> %tmp3, %tmp1 + store <4 x i16> %add, <4 x i16>* %X, align 8 + ret void +} + +; Combine vtrn+vadd->vpadd. +define void @addCombineToVPADD_i32(<4 x i32> *%cbcr, <2 x i32> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADD_i32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vpadd.i32 d16, d16, d17 +; CHECK-NEXT: vstr d16, [r1] +; CHECK-NEXT: mov pc, lr + %tmp = load <4 x i32>, <4 x i32>* %cbcr + %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2> + %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3> + %add = add <2 x i32> %tmp3, %tmp1 + store <2 x i32> %add, <2 x i32>* %X, align 8 + ret void +} + ; Combine vuzp+vaddl->vpaddl -; FIXME: Implement this optimization. 
-define void @addCombineToVPADDL_sext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp { -; CHECK-LABEL: addCombineToVPADDL_sext: +define void @addCombineToVPADDLq_s8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADDLq_s8: ; CHECK: @ BB#0: ; CHECK-NEXT: vld1.64 {d16, d17}, [r0] -; CHECK-NEXT: vorr d18, d17, d17 -; CHECK-NEXT: vuzp.8 d16, d18 -; CHECK-NEXT: vaddl.s8 q8, d18, d16 +; CHECK-NEXT: vpaddl.s8 q8, q8 ; CHECK-NEXT: vst1.64 {d16, d17}, [r1] ; CHECK-NEXT: mov pc, lr %tmp = load <16 x i8>, <16 x i8>* %cbcr @@ -254,10 +280,200 @@ define void @addCombineToVPADDL_sext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind s ret void } -; Legalization produces a EXTRACT_VECTOR_ELT DAG node which performs an extend from -; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s. -define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) { -; CHECK-LABEL: fromExtendingExtractVectorElt: +; Combine vuzp+vaddl->vpaddl +; FIXME: Legalization butchers the shuffles. +define void @addCombineToVPADDL_s8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADDL_s8: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmov.i16 d18, #0x8 +; CHECK-NEXT: vneg.s16 d18, d18 +; CHECK-NEXT: vext.8 d19, d16, d16, #1 +; CHECK-NEXT: vshl.i16 d16, d16, #8 +; CHECK-NEXT: vshl.i16 d17, d19, #8 +; CHECK-NEXT: vshl.s16 d16, d16, d18 +; CHECK-NEXT: vshl.s16 d17, d17, d18 +; CHECK-NEXT: vadd.i16 d16, d17, d16 +; CHECK-NEXT: vstr d16, [r1] +; CHECK-NEXT: mov pc, lr + %tmp = load <16 x i8>, <16 x i8>* %cbcr + %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %tmp4 = sext <4 x i8> %tmp3 to <4 x i16> + %tmp5 = sext <4 x i8> %tmp1 to <4 x i16> + %add = add <4 x i16> %tmp4, %tmp5 + store <4 x i16> %add, <4 x i16>* %X, align 8 + ret void +} + +; Combine vuzp+vaddl->vpaddl +define void @addCombineToVPADDLq_u8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADDLq_u8: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vpaddl.u8 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NEXT: mov pc, lr + %tmp = load <16 x i8>, <16 x i8>* %cbcr + %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %tmp4 = zext <8 x i8> %tmp3 to <8 x i16> + %tmp5 = zext <8 x i8> %tmp1 to <8 x i16> + %add = add <8 x i16> %tmp4, %tmp5 + store <8 x i16> %add, <8 x i16>* %X, align 8 + ret void +} + +; In theory, it's possible to match this to vpaddl, but rearranging the +; shuffle is awkward, so this doesn't match at the moment. 
+define void @addCombineToVPADDLq_u8_early_zext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADDLq_u8_early_zext: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmovl.u8 q9, d17 +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vuzp.16 q8, q9 +; CHECK-NEXT: vadd.i16 q8, q8, q9 +; CHECK-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NEXT: mov pc, lr + %tmp = load <16 x i8>, <16 x i8>* %cbcr + %tmp1 = zext <16 x i8> %tmp to <16 x i16> + %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %add = add <8 x i16> %tmp2, %tmp3 + store <8 x i16> %add, <8 x i16>* %X, align 8 + ret void +} + +; Combine vuzp+vaddl->vpaddl +; FIXME: Legalization butchers the shuffle. +define void @addCombineToVPADDL_u8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADDL_u8: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vext.8 d18, d16, d16, #1 +; CHECK-NEXT: vbic.i16 d16, #0xff00 +; CHECK-NEXT: vbic.i16 d18, #0xff00 +; CHECK-NEXT: vadd.i16 d16, d18, d16 +; CHECK-NEXT: vstr d16, [r1] +; CHECK-NEXT: mov pc, lr + %tmp = load <16 x i8>, <16 x i8>* %cbcr + %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %tmp4 = zext <4 x i8> %tmp3 to <4 x i16> + %tmp5 = zext <4 x i8> %tmp1 to <4 x i16> + %add = add <4 x i16> %tmp4, %tmp5 + store <4 x i16> %add, <4 x i16>* %X, align 8 + ret void +} + +; Matching to vpaddl.8 requires matching shuffle(zext()). 
+define void @addCombineToVPADDL_u8_early_zext(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADDL_u8_early_zext: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vpadd.i16 d16, d16, d17 +; CHECK-NEXT: vstr d16, [r1] +; CHECK-NEXT: mov pc, lr + %tmp = load <16 x i8>, <16 x i8>* %cbcr + %tmp1 = zext <16 x i8> %tmp to <16 x i16> + %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %add = add <4 x i16> %tmp2, %tmp3 + store <4 x i16> %add, <4 x i16>* %X, align 8 + ret void +} + +; Combine vuzp+vaddl->vpaddl +define void @addCombineToVPADDLq_s16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADDLq_s16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vpaddl.s16 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NEXT: mov pc, lr + %tmp = load <8 x i16>, <8 x i16>* %cbcr + %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %tmp4 = sext <4 x i16> %tmp3 to <4 x i32> + %tmp5 = sext <4 x i16> %tmp1 to <4 x i32> + %add = add <4 x i32> %tmp4, %tmp5 + store <4 x i32> %add, <4 x i32>* %X, align 8 + ret void +} + +; Combine vuzp+vaddl->vpaddl +define void @addCombineToVPADDLq_u16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADDLq_u16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vpaddl.u16 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NEXT: mov pc, lr + %tmp = load <8 x i16>, <8 x i16>* %cbcr + %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %tmp4 = zext <4 x i16> %tmp3 to <4 x i32> + %tmp5 = zext <4 x i16> %tmp1 to <4 x i32> + %add = add <4 x i32> %tmp4, %tmp5 + store <4 x i32> %add, <4 x i32>* %X, align 8 + ret void +} + +; Combine vtrn+vaddl->vpaddl +define void @addCombineToVPADDLq_s32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADDLq_s32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vpaddl.s32 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NEXT: mov pc, lr + %tmp = load <4 x i32>, <4 x i32>* %cbcr + %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2> + %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3> + %tmp4 = sext <2 x i32> %tmp3 to <2 x i64> + %tmp5 = sext <2 x i32> %tmp1 to <2 x i64> + %add = add <2 x i64> %tmp4, %tmp5 + store <2 x i64> %add, <2 x i64>* %X, align 8 + ret void +} + +; Combine vtrn+vaddl->vpaddl +define void @addCombineToVPADDLq_u32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp { +; CHECK-LABEL: addCombineToVPADDLq_u32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vpaddl.u32 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NEXT: mov pc, lr + %tmp = load <4 x i32>, <4 x i32>* %cbcr + %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2> + %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3> + %tmp4 = zext <2 x i32> %tmp3 to <2 x i64> + %tmp5 = zext <2 x i32> %tmp1 to <2 x i64> + %add = add <2 x i64> %tmp4, %tmp5 + store <2 x i64> 
%add, <2 x i64>* %X, align 8 + ret void +} + +; Legalization promotes the <4 x i8> to <4 x i16>. +define <4 x i8> @fromExtendingExtractVectorElt_i8(<8 x i8> %in) { +; CHECK-LABEL: fromExtendingExtractVectorElt_i8: +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vpaddl.s8 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr + %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %x = add <4 x i8> %tmp2, %tmp1 + ret <4 x i8> %x +} + +; Legalization promotes the <2 x i16> to <2 x i32>. +define <2 x i16> @fromExtendingExtractVectorElt_i16(<4 x i16> %in) { +; CHECK-LABEL: fromExtendingExtractVectorElt_i16: ; CHECK: @ BB#0: ; CHECK-NEXT: vmov d16, r0, r1 ; CHECK-NEXT: vpaddl.s16 d16, d16 diff --git a/test/CodeGen/ARM/vtrn.ll b/test/CodeGen/ARM/vtrn.ll index 36731e933bab..df6336043fdf 100644 --- a/test/CodeGen/ARM/vtrn.ll +++ b/test/CodeGen/ARM/vtrn.ll @@ -70,14 +70,14 @@ define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; CHECK-NEXT: vldr d16, [r1] ; CHECK-NEXT: vldr d17, [r0] ; CHECK-NEXT: vtrn.32 d17, d16 -; CHECK-NEXT: vadd.i32 d16, d17, d16 +; CHECK-NEXT: vmul.i32 d16, d17, d16 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2> %tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 3> - %tmp5 = add <2 x i32> %tmp3, %tmp4 + %tmp5 = mul <2 x i32> %tmp3, %tmp4 ret <2 x i32> %tmp5 } diff --git a/test/CodeGen/Mips/llvm-ir/extractelement.ll b/test/CodeGen/Mips/llvm-ir/extractelement.ll index 1e1b02df99a2..3c7df4a5e99f 100644 --- a/test/CodeGen/Mips/llvm-ir/extractelement.ll +++ b/test/CodeGen/Mips/llvm-ir/extractelement.ll @@ -14,6 +14,7 @@ define i1 @via_stack_bug(i8 signext %idx) { ; ALL-DAG: addiu [[ONE:\$[0-9]+]], $zero, 1 ; ALL-DAG: sb [[ONE]], 7($sp) ; ALL-DAG: sb $zero, 6($sp) +; ALL-DAG: andi [[MASKED_IDX:\$[0-9]+]], $4, 1 ; ALL-DAG: addiu [[VPTR:\$[0-9]+]], $sp, 6 -; ALL-DAG: addu [[EPTR:\$[0-9]+]], $4, [[VPTR]] +; ALL-DAG: or [[EPTR:\$[0-9]+]], [[MASKED_IDX]], [[VPTR]] ; ALL: lbu $2, 0([[EPTR]]) diff --git a/test/CodeGen/Mips/msa/immediates-bad.ll b/test/CodeGen/Mips/msa/immediates-bad.ll new file mode 100644 index 000000000000..c6b8fcef649a --- /dev/null +++ b/test/CodeGen/Mips/msa/immediates-bad.ll @@ -0,0 +1,1681 @@ +; RUN: not llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s 2> %t1 +; RUN: FileCheck %s < %t1 + +; Test that the immediate intrinsics with out of range values trigger an error. 
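Two mechanics are worth spelling out for this file: `not llc ... 2> %t1` inverts llc's failing exit status while capturing its stderr, and because an out-of-range immediate is reported through a fatal error that terminates llc, only the first offending function ever produces output -- which is why a single CHECK line after the first test covers the whole file. A minimal in-range/out-of-range pair (a sketch; it assumes binsli.b's byte-element immediate is restricted to a few bits, consistent with the 65 used in @binsli_b below):

declare <16 x i8> @llvm.mips.binsli.b(<16 x i8>, <16 x i8>, i32)

define <16 x i8> @binsli_b_in_range(<16 x i8> %a, <16 x i8> %b) {
  ; A small immediate such as 7 encodes, so this compiles cleanly...
  %r = call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %a, <16 x i8> %b, i32 7)
  ret <16 x i8> %r
}
; ...whereas i32 65, as in @binsli_b below, aborts compilation with
; "LLVM ERROR: Immediate out of range".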
+ + +define void @binsli_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %a, <16 x i8> %a, i32 65) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} +; CHECK: LLVM ERROR: Immediate out of range + +define void @binsri_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %a, <16 x i8> %a, i32 5) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @bmnzi_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %a, <16 x i8> %a, i32 63) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @bmzi_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %a, <16 x i8> %a, i32 63) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @bnegi_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.bnegi.b(<16 x i8> %a, i32 6) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @bseli_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.bseli.b(<16 x i8> %a, <16 x i8> %a, i32 63) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @bseti_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.bseti.b(<16 x i8> %a, i32 9) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @clei_s_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.clei.s.b(<16 x i8> %a, i32 152) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @clei_u_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.clei.u.b(<16 x i8> %a, i32 163) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @clti_s_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.clti.s.b(<16 x i8> %a, i32 129) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @clti_u_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.clti.u.b(<16 x i8> %a, i32 163) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @ldi_b(<16 x i8> * %ptr) { +entry: + %r = call <16 x i8> @llvm.mips.ldi.b(i32 1025) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @maxi_s_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.maxi.s.b(<16 x i8> %a, i32 163) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @maxi_u_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.maxi.u.b(<16 x i8> %a, i32 163) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @mini_s_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.mini.s.b(<16 x i8> %a, i32 163) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @mini_u_b(<16 x i8> * %ptr) { +entry: + %a = 
load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.mini.u.b(<16 x i8> %a, i32 163) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @nori_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.nori.b(<16 x i8> %a, i32 63) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @ori_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.ori.b(<16 x i8> %a, i32 63) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @sldi_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %a, <16 x i8> %a, i32 7) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @slli_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.slli.b(<16 x i8> %a, i32 65) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @splati_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.splati.b(<16 x i8> %a, i32 65) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @srai_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.srai.b(<16 x i8> %a, i32 65) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @srari_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.srari.b(<16 x i8> %a, i32 65) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @srli_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.srli.b(<16 x i8> %a, i32 65) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @srlri_b(<16 x i8> * %ptr) { +entry: + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call <16 x i8> @llvm.mips.srlri.b(<16 x i8> %a, i32 65) + store <16 x i8> %r, <16 x i8> * %ptr, align 16 + ret void +} + +define void @addvi_w(<4 x i32> * %ptr) { +entry: + %a = load <4 x i32>, <4 x i32> * %ptr, align 16 + %r = call <4 x i32> @llvm.mips.addvi.w(<4 x i32> %a, i32 63) + store <4 x i32> %r, <4 x i32> * %ptr, align 16 + ret void +} + +define void @bclri_w(<4 x i32> * %ptr) { +entry: + %a = load <4 x i32>, <4 x i32> * %ptr, align 16 + %r = call <4 x i32> @llvm.mips.bclri.w(<4 x i32> %a, i32 63) + store <4 x i32> %r, <4 x i32> * %ptr, align 16 + ret void +} + +define void @binsli_w(<4 x i32> * %ptr) { +entry: + %a = load <4 x i32>, <4 x i32> * %ptr, align 16 + %r = call <4 x i32> @llvm.mips.binsli.w(<4 x i32> %a, <4 x i32> %a, i32 63) + store <4 x i32> %r, <4 x i32> * %ptr, align 16 + ret void +} + +define void @binsri_w(<4 x i32> * %ptr) { +entry: + %a = load <4 x i32>, <4 x i32> * %ptr, align 16 + %r = call <4 x i32> @llvm.mips.binsri.w(<4 x i32> %a, <4 x i32> %a, i32 63) + store <4 x i32> %r, <4 x i32> * %ptr, align 16 + ret void +} + +define void @bnegi_w(<4 x i32> * %ptr) { +entry: + %a = load <4 x i32>, <4 x i32> * %ptr, align 16 + %r = call <4 x i32> @llvm.mips.bnegi.w(<4 x i32> %a, i32 63) + store <4 x i32> %r, <4 x i32> * %ptr, align 16 + ret void +} + +define void @bseti_w(<4 x i32> * %ptr) { +entry: + %a = load <4 x i32>, <4 x i32> * %ptr, align 16 + %r = call <4 x i32> @llvm.mips.bseti.w(<4 x i32> %a, 
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @clei_s_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.clei.s.w(<4 x i32> %a, i32 63)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @clei_u_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.clei.u.w(<4 x i32> %a, i32 63)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @clti_s_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.clti.s.w(<4 x i32> %a, i32 63)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @clti_u_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.clti.u.w(<4 x i32> %a, i32 63)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @maxi_s_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.maxi.s.w(<4 x i32> %a, i32 63)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @maxi_u_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.maxi.u.w(<4 x i32> %a, i32 63)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @mini_s_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.mini.s.w(<4 x i32> %a, i32 63)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @mini_u_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.mini.u.w(<4 x i32> %a, i32 63)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @ldi_w(<4 x i32> * %ptr) {
+entry:
+  %r = call <4 x i32> @llvm.mips.ldi.w(i32 1024)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @sldi_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %a, <4 x i32> %a, i32 63)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @slli_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.slli.w(<4 x i32> %a, i32 65)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @splati_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.splati.w(<4 x i32> %a, i32 65)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @srai_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.srai.w(<4 x i32> %a, i32 65)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @srari_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.srari.w(<4 x i32> %a, i32 65)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @srli_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.srli.w(<4 x i32> %a, i32 65)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @srlri_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.srlri.w(<4 x i32> %a, i32 65)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @addvi_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.addvi.h(<8 x i16> %a, i32 65)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @bclri_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.bclri.h(<8 x i16> %a, i32 16)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @binsli_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.binsli.h(<8 x i16> %a, <8 x i16> %a, i32 17)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @binsri_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %a, <8 x i16> %a, i32 19)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @bnegi_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.bnegi.h(<8 x i16> %a, i32 19)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @bseti_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.bseti.h(<8 x i16> %a, i32 19)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @clei_s_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.clei.s.h(<8 x i16> %a, i32 63)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @clei_u_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.clei.u.h(<8 x i16> %a, i32 130)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @clti_s_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.clti.s.h(<8 x i16> %a, i32 63)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @clti_u_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.clti.u.h(<8 x i16> %a, i32 63)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @maxi_s_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.maxi.s.h(<8 x i16> %a, i32 63)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @maxi_u_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.maxi.u.h(<8 x i16> %a, i32 130)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @mini_s_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.mini.s.h(<8 x i16> %a, i32 63)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @mini_u_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.mini.u.h(<8 x i16> %a, i32 130)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @ldi_h(<8 x i16> * %ptr) {
+entry:
+  %r = call <8 x i16> @llvm.mips.ldi.h(i32 1024)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @sldi_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %a, <8 x i16> %a, i32 65)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @slli_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.slli.h(<8 x i16> %a, i32 65)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @splati_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.splati.h(<8 x i16> %a, i32 65)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @srai_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.srai.h(<8 x i16> %a, i32 65)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @srari_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.srari.h(<8 x i16> %a, i32 65)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @srli_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.srli.h(<8 x i16> %a, i32 65)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @srlri_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.srlri.h(<8 x i16> %a, i32 65)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define i32 @copy_s_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call i32 @llvm.mips.copy.s.b(<16 x i8> %a, i32 17)
+  ret i32 %r
+}
+
+
+define i32 @copy_s_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call i32 @llvm.mips.copy.s.h(<8 x i16> %a, i32 9)
+  ret i32 %r
+}
+
+
+define i32 @copy_s_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call i32 @llvm.mips.copy.s.w(<4 x i32> %a, i32 5)
+  ret i32 %r
+}
+
+
+define i32 @copy_u_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call i32 @llvm.mips.copy.u.b(<16 x i8> %a, i32 16)
+  ret i32 %r
+}
+
+
+define i32 @copy_u_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call i32 @llvm.mips.copy.u.h(<8 x i16> %a, i32 9)
+  ret i32 %r
+}
+
+
+define i32 @copy_u_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call i32 @llvm.mips.copy.u.w(<4 x i32> %a, i32 5)
+  ret i32 %r
+}
+
+define i64 @copy_s_d(<2 x i64> * %ptr) {
+entry: %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call i64 @llvm.mips.copy.s.d(<2 x i64> %a, i32 3)
+  ret i64 %r
+}
+
+define i64 @copy_u_d(<2 x i64> * %ptr) {
+entry: %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call i64 @llvm.mips.copy.u.d(<2 x i64> %a, i32 3)
+  ret i64 %r
+}
+
+define void @addvi_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.addvi.d(<2 x i64> %a, i32 65)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @bclri_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.bclri.d(<2 x i64> %a, i32 64)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @binsli_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.binsli.d(<2 x i64> %a, <2 x i64> %a, i32 65)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @binsri_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.binsri.d(<2 x i64> %a, <2 x i64> %a, i32 65)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @bnegi_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.bnegi.d(<2 x i64> %a, i32 65)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @bseti_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.bseti.d(<2 x i64> %a, i32 65)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @clei_s_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.clei.s.d(<2 x i64> %a, i32 63)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @clei_u_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.clei.u.d(<2 x i64> %a, i32 63)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @clti_s_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.clti.s.d(<2 x i64> %a, i32 63)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @clti_u_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.clti.u.d(<2 x i64> %a, i32 63)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @ldi_d(<2 x i64> * %ptr) {
+entry:
+  %r = call <2 x i64> @llvm.mips.ldi.d(i32 1024)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @maxi_s_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.maxi.s.d(<2 x i64> %a, i32 63)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @maxi_u_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.maxi.u.d(<2 x i64> %a, i32 63)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @mini_s_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.mini.s.d(<2 x i64> %a, i32 63)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @mini_u_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.mini.u.d(<2 x i64> %a, i32 63)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @sldi_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %a, <2 x i64> %a, i32 1)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @slli_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.slli.d(<2 x i64> %a, i32 65)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @srai_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.srai.d(<2 x i64> %a, i32 65)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @srari_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.srari.d(<2 x i64> %a, i32 65)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @srli_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.srli.d(<2 x i64> %a, i32 65)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @srlri_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.srlri.d(<2 x i64> %a, i32 65)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+; Negative numbers
+
+
+define void @neg_addvi_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.addvi.b(<16 x i8> %a, i32 -25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_andi_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.andi.b(<16 x i8> %a, i32 -25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bclri_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.bclri.b(<16 x i8> %a, i32 -3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_binsli_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %a, <16 x i8> %a, i32 -3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_binsri_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %a, <16 x i8> %a, i32 5)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bmnzi_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %a, <16 x i8> %a, i32 -25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bmzi_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %a, <16 x i8> %a, i32 -25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bnegi_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.bnegi.b(<16 x i8> %a, i32 6)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bseli_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.bseli.b(<16 x i8> %a, <16 x i8> %a, i32 -25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bseti_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.bseti.b(<16 x i8> %a, i32 -5)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clei_s_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.clei.s.b(<16 x i8> %a, i32 -120)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clei_u_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.clei.u.b(<16 x i8> %a, i32 -25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clti_s_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.clti.s.b(<16 x i8> %a, i32 -35)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clti_u_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.clti.u.b(<16 x i8> %a, i32 -25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_ldi_b(<16 x i8> * %ptr) {
+entry:
+  %r = call <16 x i8> @llvm.mips.ldi.b(i32 -3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_maxi_s_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.maxi.s.b(<16 x i8> %a, i32 2)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_maxi_u_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.maxi.u.b(<16 x i8> %a, i32 2)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_mini_s_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.mini.s.b(<16 x i8> %a, i32 2)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_mini_u_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.mini.u.b(<16 x i8> %a, i32 2)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_nori_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.nori.b(<16 x i8> %a, i32 -25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_ori_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.ori.b(<16 x i8> %a, i32 -25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_sldi_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %a, <16 x i8> %a, i32 -7)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_slli_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.slli.b(<16 x i8> %a, i32 -3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_splati_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.splati.b(<16 x i8> %a, i32 -3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srai_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.srai.b(<16 x i8> %a, i32 -3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srari_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.srari.b(<16 x i8> %a, i32 -3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srli_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.srli.b(<16 x i8> %a, i32 -3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srlri_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.srlri.b(<16 x i8> %a, i32 -3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @neg_addvi_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.addvi.w(<4 x i32> %a, i32 -25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bclri_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.bclri.w(<4 x i32> %a, i32 -25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_binsli_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.binsli.w(<4 x i32> %a, <4 x i32> %a, i32 -25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_binsri_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.binsri.w(<4 x i32> %a, <4 x i32> %a, i32 -25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bnegi_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.bnegi.w(<4 x i32> %a, i32 -25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bseti_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.bseti.w(<4 x i32> %a, i32 -25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clei_s_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.clei.s.w(<4 x i32> %a, i32 -140)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clei_u_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.clei.u.w(<4 x i32> %a, i32 -25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clti_s_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.clti.s.w(<4 x i32> %a, i32 -150)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clti_u_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.clti.u.w(<4 x i32> %a, i32 -25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_maxi_s_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.maxi.s.w(<4 x i32> %a, i32 -200)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_maxi_u_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.maxi.u.w(<4 x i32> %a, i32 -200)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_mini_s_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.mini.s.w(<4 x i32> %a, i32 -200)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_mini_u_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.mini.u.w(<4 x i32> %a, i32 -200)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_ldi_w(<4 x i32> * %ptr) {
+entry:
+  %r = call <4 x i32> @llvm.mips.ldi.w(i32 -300)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_sldi_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %a, <4 x i32> %a, i32 -20)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_slli_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.slli.w(<4 x i32> %a, i32 -3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_splati_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.splati.w(<4 x i32> %a, i32 -3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srai_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.srai.w(<4 x i32> %a, i32 -3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srari_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.srari.w(<4 x i32> %a, i32 -3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srli_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.srli.w(<4 x i32> %a, i32 -3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srlri_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.srlri.w(<4 x i32> %a, i32 -3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @neg_addvi_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.addvi.h(<8 x i16> %a, i32 -25)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bclri_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.bclri.h(<8 x i16> %a, i32 -8)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_binsli_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.binsli.h(<8 x i16> %a, <8 x i16> %a, i32 -8)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_binsri_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %a, <8 x i16> %a, i32 -15)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bnegi_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.bnegi.h(<8 x i16> %a, i32 -14)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bseti_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.bseti.h(<8 x i16> %a, i32 -15)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clei_s_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.clei.s.h(<8 x i16> %a, i32 -25)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clei_u_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.clei.u.h(<8 x i16> %a, i32 -25)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clti_s_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.clti.s.h(<8 x i16> %a, i32 -150)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clti_u_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.clti.u.h(<8 x i16> %a, i32 -25)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_maxi_s_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.maxi.s.h(<8 x i16> %a, i32 -200)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_maxi_u_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.maxi.u.h(<8 x i16> %a, i32 -200)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_mini_s_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.mini.s.h(<8 x i16> %a, i32 -200)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_mini_u_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.mini.u.h(<8 x i16> %a, i32 -2)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_ldi_h(<8 x i16> * %ptr) {
+entry:
+  %r = call <8 x i16> @llvm.mips.ldi.h(i32 -300)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_sldi_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %a, <8 x i16> %a, i32 -3)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_slli_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.slli.h(<8 x i16> %a, i32 -3)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_splati_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.splati.h(<8 x i16> %a, i32 -3)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srai_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.srai.h(<8 x i16> %a, i32 -3)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srari_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.srari.h(<8 x i16> %a, i32 -3)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srli_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.srli.h(<8 x i16> %a, i32 -3)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srlri_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call <8 x i16> @llvm.mips.srlri.h(<8 x i16> %a, i32 -3)
+  store <8 x i16> %r, <8 x i16> * %ptr, align 16
+  ret void
+}
+
+define i32 @neg_copy_s_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call i32 @llvm.mips.copy.s.b(<16 x i8> %a, i32 -1)
+  ret i32 %r
+}
+
+define i32 @neg_copy_s_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call i32 @llvm.mips.copy.s.h(<8 x i16> %a, i32 -1)
+  ret i32 %r
+}
+
+define i32 @neg_copy_s_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call i32 @llvm.mips.copy.s.w(<4 x i32> %a, i32 -1)
+  ret i32 %r
+}
+
+define i32 @neg_copy_u_b(<16 x i8> * %ptr) {
+entry:
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call i32 @llvm.mips.copy.u.b(<16 x i8> %a, i32 -1)
+  ret i32 %r
+}
+
+
+define i32 @neg_copy_u_h(<8 x i16> * %ptr) {
+entry:
+  %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+  %r = call i32 @llvm.mips.copy.u.h(<8 x i16> %a, i32 -1)
+  ret i32 %r
+}
+
+
+define i32 @neg_copy_u_w(<4 x i32> * %ptr) {
+entry:
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call i32 @llvm.mips.copy.u.w(<4 x i32> %a, i32 -1)
+  ret i32 %r
+}
+
+define i64 @neg_copy_s_d(<2 x i64> * %ptr) {
+entry: %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call i64 @llvm.mips.copy.s.d(<2 x i64> %a, i32 -1)
+  ret i64 %r
+}
+
+define i64 @neg_copy_u_d(<2 x i64> * %ptr) {
+entry: %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call i64 @llvm.mips.copy.u.d(<2 x i64> %a, i32 -1)
+  ret i64 %r
+}
+
+define void @neg_addvi_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.addvi.d(<2 x i64> %a, i32 -25)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bclri_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.bclri.d(<2 x i64> %a, i32 -25)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_binsli_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.binsli.d(<2 x i64> %a, <2 x i64> %a, i32 -25)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_binsri_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.binsri.d(<2 x i64> %a, <2 x i64> %a, i32 -25)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bnegi_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.bnegi.d(<2 x i64> %a, i32 -25)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_bseti_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.bseti.d(<2 x i64> %a, i32 -25)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clei_s_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.clei.s.d(<2 x i64> %a, i32 -45)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clei_u_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.clei.u.d(<2 x i64> %a, i32 -25)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clti_s_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.clti.s.d(<2 x i64> %a, i32 -32)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_clti_u_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.clti.u.d(<2 x i64> %a, i32 -25)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_ldi_d(<2 x i64> * %ptr) {
+entry:
+  %r = call <2 x i64> @llvm.mips.ldi.d(i32 -3)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_maxi_s_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.maxi.s.d(<2 x i64> %a, i32 -202)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_maxi_u_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.maxi.u.d(<2 x i64> %a, i32 -2)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_mini_s_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.mini.s.d(<2 x i64> %a, i32 -202)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_mini_u_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.mini.u.d(<2 x i64> %a, i32 -2)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_sldi_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %a, <2 x i64> %a, i32 -1)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_slli_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.slli.d(<2 x i64> %a, i32 -3)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srai_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.srai.d(<2 x i64> %a, i32 -3)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srari_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.srari.d(<2 x i64> %a, i32 -3)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srli_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.srli.d(<2 x i64> %a, i32 -3)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+define void @neg_srlri_d(<2 x i64> * %ptr) {
+entry:
+  %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+  %r = call <2 x i64> @llvm.mips.srlri.d(<2 x i64> %a, i32 -3)
+  store <2 x i64> %r, <2 x i64> * %ptr, align 16
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.ldi.h(i32)
+declare <8 x i16> @llvm.mips.addvi.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bclri.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.binsli.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.binsri.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bnegi.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bseti.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clei.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clei.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clti.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clti.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.maxi.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.maxi.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.mini.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.mini.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.sldi.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.slli.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.splati.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srai.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srari.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srli.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srlri.h(<8 x i16>, i32)
+declare <4 x i32> @llvm.mips.addvi.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bclri.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.binsli.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.binsri.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bnegi.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bseti.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.ldi.w(i32)
+declare <4 x i32> @llvm.mips.clei.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clei.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clti.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clti.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.maxi.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.maxi.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.mini.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.mini.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.sldi.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.slli.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.splati.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srai.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srari.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srli.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srlri.w(<4 x i32>, i32)
+declare <2 x i64> @llvm.mips.ldi.d(i32)
+declare <2 x i64> @llvm.mips.addvi.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bclri.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.binsli.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.binsri.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bnegi.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bseti.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clei.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clei.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clti.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clti.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.maxi.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.maxi.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.mini.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.mini.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.sldi.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.slli.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.splati.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srai.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srari.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srli.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srlri.d(<2 x i64>, i32)
+declare <16 x i8> @llvm.mips.ldi.b(i32)
+declare <16 x i8> @llvm.mips.addvi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.andi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bclri.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.binsli.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.binsri.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bmnzi.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bnegi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bseli.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bseti.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clei.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clei.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clti.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clti.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.maxi.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.maxi.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.mini.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.mini.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.nori.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.ori.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.sldi.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.slli.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.splati.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srai.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srari.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srli.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srlri.b(<16 x i8>, i32)
+declare i32 @llvm.mips.copy.s.h(<8 x i16>, i32)
+declare i32 @llvm.mips.copy.u.h(<8 x i16>, i32)
+declare i32 @llvm.mips.copy.s.w(<4 x i32>, i32)
+declare i32 @llvm.mips.copy.u.w(<4 x i32>, i32)
+declare i64 @llvm.mips.copy.s.d(<2 x i64>, i32)
+declare i64 @llvm.mips.copy.u.d(<2 x i64>, i32)
+declare i32 @llvm.mips.copy.s.b(<16 x i8>, i32)
+declare i32 @llvm.mips.copy.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bmzi.b(<16 x i8>, <16 x i8>, i32)
diff --git a/test/CodeGen/Mips/msa/immediates.ll b/test/CodeGen/Mips/msa/immediates.ll
new file mode 100644
index 000000000000..b561ace30a8a
--- /dev/null
+++ b/test/CodeGen/Mips/msa/immediates.ll
@@ -0,0 +1,1276 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s -check-prefixes=CHECK,MSA32
+; RUN: llc -march=mips64 -mattr=+msa,+fp64 -relocation-model=pic -target-abi n32 < %s \
+; RUN: | FileCheck %s -check-prefixes=CHECK,MSA64,MSA64N32
+; RUN: llc -march=mips64 -mattr=+msa,+fp64 -relocation-model=pic -target-abi n64 < %s \
+; RUN: | FileCheck %s -check-prefixes=CHECK,MSA64,MSA64N64
+
+; Test that the immediate intrinsics don't crash LLVM.
+
+; Some of the intrinsics lower to equivalent forms.
+
+define void @addvi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: addvi_b:
+; CHECK: addvi.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.addvi.b(<16 x i8> %a, i32 25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @andi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: andi_b:
+; CHECK: andi.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.andi.b(<16 x i8> %a, i32 25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @bclri_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: bclri_b:
+; CHECK: andi.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.bclri.b(<16 x i8> %a, i32 3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @binsli_b(<16 x i8> * %ptr, <16 x i8> * %ptr2) {
+entry:
+; CHECK-LABEL: binsli_b:
+; CHECK: binsli.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %b = load <16 x i8>, <16 x i8> * %ptr2, align 16
+  %r = call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %a, <16 x i8> %b, i32 3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @binsri_b(<16 x i8> * %ptr, <16 x i8> * %ptr2) {
+entry:
+; CHECK-LABEL: binsri_b:
+; CHECK: binsri.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %b = load <16 x i8>, <16 x i8> * %ptr2, align 16
+  %r = call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %a, <16 x i8> %b, i32 5)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @bmnzi_b(<16 x i8> * %ptr, <16 x i8> * %ptr2) {
+entry:
+; CHECK-LABEL: bmnzi_b:
+; CHECK: bmnzi.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %b = load <16 x i8>, <16 x i8> * %ptr2, align 16
+  %r = call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %a, <16 x i8> %b, i32 25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @bmzi_b(<16 x i8> * %ptr, <16 x i8> * %ptr2) {
+entry:
+; CHECK-LABEL: bmzi_b:
+; CHECK: bmnzi.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %b = load <16 x i8>, <16 x i8> * %ptr2, align 16
+  %r = call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %a, <16 x i8> %b, i32 25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @bnegi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: bnegi_b:
+; CHECK: bnegi.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.bnegi.b(<16 x i8> %a, i32 6)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @bseli_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: bseli_b:
+; CHECK: bseli.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.bseli.b(<16 x i8> %a, <16 x i8> %a, i32 25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @bseti_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: bseti_b:
+; CHECK: bseti.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.bseti.b(<16 x i8> %a, i32 5)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @clei_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: clei_s_b:
+; CHECK: clei_s.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.clei.s.b(<16 x i8> %a, i32 12)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @clei_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: clei_u_b:
+; CHECK: clei_u.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.clei.u.b(<16 x i8> %a, i32 25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @clti_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: clti_s_b:
+; CHECK: clti_s.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.clti.s.b(<16 x i8> %a, i32 15)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @clti_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: clti_u_b:
+; CHECK: clti_u.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.clti.u.b(<16 x i8> %a, i32 25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @ldi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: ldi_b:
+; CHECK: ldi.b
+  %r = call <16 x i8> @llvm.mips.ldi.b(i32 3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @maxi_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_s_b:
+; CHECK: maxi_s.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.maxi.s.b(<16 x i8> %a, i32 2)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @maxi_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_u_b:
+; CHECK: maxi_u.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.maxi.u.b(<16 x i8> %a, i32 2)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @mini_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: mini_s_b:
+; CHECK: mini_s.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.mini.s.b(<16 x i8> %a, i32 2)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @mini_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: mini_u_b:
+; CHECK: mini_u.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.mini.u.b(<16 x i8> %a, i32 2)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @nori_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: nori_b:
+; CHECK: nori.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.nori.b(<16 x i8> %a, i32 25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @ori_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: ori_b:
+; CHECK: ori.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.ori.b(<16 x i8> %a, i32 25)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @sldi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: sldi_b:
+; CHECK: sldi.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %a, <16 x i8> %a, i32 7)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @slli_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: slli_b:
+; CHECK: slli.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.slli.b(<16 x i8> %a, i32 3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @splati_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: splati_b:
+; CHECK: splati.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.splati.b(<16 x i8> %a, i32 3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @srai_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: srai_b:
+; CHECK: srai.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.srai.b(<16 x i8> %a, i32 3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @srari_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: srari_b:
+; CHECK: srari.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.srari.b(<16 x i8> %a, i32 3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @srli_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: srli_b:
+; CHECK: srli.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.srli.b(<16 x i8> %a, i32 3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @srlri_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: srlri_b:
+; CHECK: srlri.b
+  %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+  %r = call <16 x i8> @llvm.mips.srlri.b(<16 x i8> %a, i32 3)
+  store <16 x i8> %r, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @ld_b(<16 x i8> * %ptr, i8 * %ldptr, i32 %offset) {
+entry:
+; CHECK-LABEL: ld_b
+; MSA32: addu $[[R0:[0-9]]], $5, $6
+
+; MSA64N32-DAG: sll $[[R2:[0-9]]], $6, 0
+; MSA64N32-DAG: sll $[[R1:[0-9]]], $5, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R1]], $[[R2]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $5, $[[R1]]
+
+; CHECK: ld.b $w{{[0-9]+}}, 0($[[R0]])
+  %a = call <16 x i8> @llvm.mips.ld.b(i8* %ldptr, i32 %offset)
+  store <16 x i8> %a, <16 x i8> * %ptr, align 16
+  ret void
+}
+
+define void @st_b(<16 x i8> * %ptr, i8 * %ldptr, i32 %offset, i8 * %stptr) {
+entry:
+; CHECK-LABEL: st_b
+; MSA32: addu $[[R0:[0-9]]], $7, $6
+
+; MSA64N32: sll $[[R1:[0-9]]], $6, 0
+; MSA64N32: sll $[[R2:[0-9]]], $7, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R2]], $[[R1]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $7, $[[R1]]
+; CHECK: st.b $w{{[0-9]+}}, 0($[[R0]])
+  %a = call <16 x i8> @llvm.mips.ld.b(i8* %ldptr, i32 0)
+  call void @llvm.mips.st.b(<16 x i8> %a, i8* %stptr, i32 %offset)
+  ret void
+}
+
+define void @addvi_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: addvi_w:
+; CHECK: addvi.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.addvi.w(<4 x i32> %a, i32 25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @bclri_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: bclri_w:
+; CHECK: bclri.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.bclri.w(<4 x i32> %a, i32 25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @binsli_w(<4 x i32> * %ptr, <4 x i32> * %ptr2) {
+entry:
+; CHECK-LABEL: binsli_w:
+; CHECK: binsli.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %b = load <4 x i32>, <4 x i32> * %ptr2, align 16
+  %r = call <4 x i32> @llvm.mips.binsli.w(<4 x i32> %a, <4 x i32> %b, i32 25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @binsri_w(<4 x i32> * %ptr, <4 x i32> * %ptr2) {
+entry:
+; CHECK-LABEL: binsri_w:
+; CHECK: binsri.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %b = load <4 x i32>, <4 x i32> * %ptr2, align 16
+  %r = call <4 x i32> @llvm.mips.binsri.w(<4 x i32> %a, <4 x i32> %b, i32 25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @bnegi_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: bnegi_w:
+; CHECK: bnegi.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.bnegi.w(<4 x i32> %a, i32 25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @bseti_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: bseti_w:
+; CHECK: bseti.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.bseti.w(<4 x i32> %a, i32 25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @clei_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: clei_s_w:
+; CHECK: clei_s.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.clei.s.w(<4 x i32> %a, i32 14)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @clei_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: clei_u_w:
+; CHECK: clei_u.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.clei.u.w(<4 x i32> %a, i32 25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @clti_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: clti_s_w:
+; CHECK: clti_s.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.clti.s.w(<4 x i32> %a, i32 15)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @clti_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: clti_u_w:
+; CHECK: clti_u.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.clti.u.w(<4 x i32> %a, i32 25)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @maxi_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_s_w:
+; CHECK: maxi_s.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.maxi.s.w(<4 x i32> %a, i32 2)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @maxi_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_u_w:
+; CHECK: maxi_u.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.maxi.u.w(<4 x i32> %a, i32 2)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @mini_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: mini_s_w:
+; CHECK: mini_s.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.mini.s.w(<4 x i32> %a, i32 2)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @mini_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: mini_u_w:
+; CHECK: mini_u.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.mini.u.w(<4 x i32> %a, i32 2)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @ldi_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: ldi_w:
+; CHECK: ldi.w
+  %r = call <4 x i32> @llvm.mips.ldi.w(i32 3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @sldi_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: sldi_w:
+; CHECK: sldi.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %a, <4 x i32> %a, i32 2)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @slli_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: slli_w:
+; CHECK: slli.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.slli.w(<4 x i32> %a, i32 3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @splati_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: splati_w:
+; CHECK: splati.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.splati.w(<4 x i32> %a, i32 3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @srai_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: srai_w:
+; CHECK: srai.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.srai.w(<4 x i32> %a, i32 3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @srari_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: srari_w:
+; CHECK: srari.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.srari.w(<4 x i32> %a, i32 3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @srli_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: srli_w:
+; CHECK: srli.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.srli.w(<4 x i32> %a, i32 3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @srlri_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: srlri_w:
+; CHECK: srlri.w
+  %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+  %r = call <4 x i32> @llvm.mips.srlri.w(<4 x i32> %a, i32 3)
+  store <4 x i32> %r, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @ld_w(<4 x i32> * %ptr, i8 * %ldptr, i32 %offset) {
+entry:
+; CHECK-LABEL: ld_w
+; MSA32: addu $[[R0:[0-9]]], $5, $6
+; MSA64N32: sll $[[R2:[0-9]]], $6, 0
+; MSA64N32: sll $[[R1:[0-9]]], $5, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R1]], $[[R2]]
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $5, $[[R1]]
+; CHECK: ld.w $w{{[0-9]+}}, 0($[[R0]])
+  %a = call <4 x i32> @llvm.mips.ld.w(i8* %ldptr, i32 %offset)
+  store <4 x i32> %a, <4 x i32> * %ptr, align 16
+  ret void
+}
+
+define void @st_w(<8 x i16> * %ptr, i8 * %ldptr, i32 %offset, i8 * %stptr) {
+entry:
+; CHECK-LABEL: st_w
+; MSA32: addu $[[R0:[0-9]]], $7, $6
+
+; MSA64N32: sll $[[R1:[0-9]+]], $6, 0
+; MSA64N32: sll $[[R2:[0-9]+]], $7, 0
+; MSA64N32: addu $[[R0:[0-9]+]], $[[R2]], $[[R1]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $7, $[[R1]]
+; CHECK: st.w $w{{[0-9]+}}, 0($[[R0]])
+  %a = call <4 x i32> @llvm.mips.ld.w(i8* %ldptr, i32 0)
+  call void @llvm.mips.st.w(<4 x i32> %a, i8* %stptr, i32 %offset)
+  ret void
+}
+
+define void @addvi_h(<8 x i16> * %ptr) {
x i16> * %ptr) { +entry: +; CHECK-LABEL: addvi_h: +; CHECK: addvi.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.addvi.h(<8 x i16> %a, i32 25) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @bclri_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: bclri_h: +; CHECK: bclri.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.bclri.h(<8 x i16> %a, i32 8) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @binsli_h(<8 x i16> * %ptr, <8 x i16> * %ptr2) { +entry: +; CHECK-LABEL: binsli_h: +; CHECK: binsli.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %b = load <8 x i16>, <8 x i16> * %ptr2, align 16 + %r = call <8 x i16> @llvm.mips.binsli.h(<8 x i16> %a, <8 x i16> %b, i32 8) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @binsri_h(<8 x i16> * %ptr, <8 x i16> * %ptr2) { +entry: +; CHECK-LABEL: binsri_h: +; CHECK: binsri.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %b = load <8 x i16>, <8 x i16> * %ptr2, align 16 + %r = call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %a, <8 x i16> %b, i32 15) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @bnegi_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: bnegi_h: +; CHECK: bnegi.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.bnegi.h(<8 x i16> %a, i32 14) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @bseti_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: bseti_h: +; CHECK: bseti.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.bseti.h(<8 x i16> %a, i32 15) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @clei_s_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: clei_s_h: +; CHECK: clei_s.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.clei.s.h(<8 x i16> %a, i32 13) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @clei_u_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: clei_u_h: +; CHECK: clei_u.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.clei.u.h(<8 x i16> %a, i32 25) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @clti_s_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: clti_s_h: +; CHECK: clti_s.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.clti.s.h(<8 x i16> %a, i32 15) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @clti_u_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: clti_u_h: +; CHECK: clti_u.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.clti.u.h(<8 x i16> %a, i32 25) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @maxi_s_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: maxi_s_h: +; CHECK: maxi_s.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.maxi.s.h(<8 x i16> %a, i32 2) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @maxi_u_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: maxi_u_h: +; CHECK: maxi_u.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.maxi.u.h(<8 x i16> %a, i32 2) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @mini_s_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: mini_s_h: +; CHECK: mini_s.h + %a = load <8 x i16>, <8 x i16> * 
%ptr, align 16 + %r = call <8 x i16> @llvm.mips.mini.s.h(<8 x i16> %a, i32 2) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @mini_u_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: mini_u_h: +; CHECK: mini_u.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.mini.u.h(<8 x i16> %a, i32 2) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @ldi_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: ldi_h: +; CHECK: ldi.h + %r = call <8 x i16> @llvm.mips.ldi.h(i32 3) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @sldi_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: sldi_h: +; CHECK: sldi.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %a, <8 x i16> %a, i32 3) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @slli_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: slli_h: +; CHECK: slli.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.slli.h(<8 x i16> %a, i32 3) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @splati_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: splati_h: +; CHECK: splati.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.splati.h(<8 x i16> %a, i32 3) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @srai_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: srai_h: +; CHECK: srai.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.srai.h(<8 x i16> %a, i32 3) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @srari_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: srari_h: +; CHECK: srari.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.srari.h(<8 x i16> %a, i32 3) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @srli_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: srli_h: +; CHECK: srli.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.srli.h(<8 x i16> %a, i32 3) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @srlri_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: srlri_h: +; CHECK: srlri.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call <8 x i16> @llvm.mips.srlri.h(<8 x i16> %a, i32 3) + store <8 x i16> %r, <8 x i16> * %ptr, align 16 + ret void +} + +define void @ld_h(<8 x i16> * %ptr, i8 * %ldptr, i32 %offset) { +entry: +; CHECK-LABEL: ld_h +; MSA32: addu $[[R0:[0-9]]], $5, $6 + +; MSA64N32-DAG: sll $[[R2:[0-9]]], $6, 0 +; MSA64N32-DAG: sll $[[R1:[0-9]]], $5, 0 +; MSA64N32: addu $[[R0:[0-9]]], $[[R1]], $[[R2]] + +; MSA64N64: sll $[[R1:[0-9]]], $6, 0 +; MSA64N64: daddu $[[R0:[0-9]]], $5, $[[R1]] + +; CHECK: ld.h $w{{[0-9]+}}, 0($[[R0]]) + %a = call <8 x i16> @llvm.mips.ld.h(i8* %ldptr, i32 %offset) + store <8 x i16> %a, <8 x i16> * %ptr, align 16 + ret void +} + +define void @st_h(<8 x i16> * %ptr, i8 * %ldptr, i32 %offset, i8 * %stptr) { +entry: +; CHECK-LABEL: st_h +; MSA32: addu $[[R0:[0-9]]], $7, $6 + +; MSA64N32-DAG: sll $[[R1:[0-9]+]], $6, 0 +; MSA64N32-DAG: sll $[[R2:[0-9]+]], $7, 0 +; MSA64N32: addu $[[R0:[0-9]+]], $[[R2]], $[[R1]] + +; MSA64N64: sll $[[R1:[0-9]]], $6, 0 +; MSA64N64: daddu $[[R0:[0-9]]], $7, $[[R1]] +; CHECK: st.h $w{{[0-9]+}}, 0($[[R0]]) + %a = call <8 x i16> @llvm.mips.ld.h(i8* %ldptr, i32 0) + call void @llvm.mips.st.h(<8 x i16> %a, i8* %stptr, i32 
%offset) + ret void +} + +define i32 @copy_s_b(<16 x i8> * %ptr) { +entry: +; CHECK-LABEL: copy_s_b: +; CHECK: copy_s.b + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call i32 @llvm.mips.copy.s.b(<16 x i8> %a, i32 1) + ret i32 %r +} +define i32 @copy_s_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: copy_s_h: +; CHECK: copy_s.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call i32 @llvm.mips.copy.s.h(<8 x i16> %a, i32 1) + ret i32 %r +} +define i32 @copy_s_w(<4 x i32> * %ptr) { +entry: +; CHECK-LABEL: copy_s_w: +; CHECK: copy_s.w + %a = load <4 x i32>, <4 x i32> * %ptr, align 16 + %r = call i32 @llvm.mips.copy.s.w(<4 x i32> %a, i32 1) + ret i32 %r +} +define i32 @copy_u_b(<16 x i8> * %ptr) { +entry: +; CHECK-LABEL: copy_u_b: +; CHECK: copy_u.b + %a = load <16 x i8>, <16 x i8> * %ptr, align 16 + %r = call i32 @llvm.mips.copy.u.b(<16 x i8> %a, i32 1) + ret i32 %r +} +define i32 @copy_u_h(<8 x i16> * %ptr) { +entry: +; CHECK-LABEL: copy_u_h: +; CHECK: copy_u.h + %a = load <8 x i16>, <8 x i16> * %ptr, align 16 + %r = call i32 @llvm.mips.copy.u.h(<8 x i16> %a, i32 1) + ret i32 %r +} +define i32 @copy_u_w(<4 x i32> * %ptr) { +entry: +; CHECK-LABEL: copy_u_w: +; MSA32: copy_s.w +; MSA64: copy_u.w + %a = load <4 x i32>, <4 x i32> * %ptr, align 16 + %r = call i32 @llvm.mips.copy.u.w(<4 x i32> %a, i32 1) + ret i32 %r +} + +define i64 @copy_s_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: copy_s_d: +; MSA32: copy_s.w +; MSA32: copy_s.w +; MSA64: copy_s.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call i64 @llvm.mips.copy.s.d(<2 x i64> %a, i32 1) + ret i64 %r +} + +define i64 @copy_u_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: copy_u_d: +; MSA32: copy_s.w +; MSA32: copy_s.w +; MSA64: copy_s.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call i64 @llvm.mips.copy.u.d(<2 x i64> %a, i32 1) + ret i64 %r +} + +define void @addvi_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: addvi_d: +; CHECK: addvi.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.addvi.d(<2 x i64> %a, i32 25) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @bclri_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: bclri_d: +; CHECK: and.v + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.bclri.d(<2 x i64> %a, i32 16) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @binsli_d(<2 x i64> * %ptr, <2 x i64> * %ptr2) { +entry: +; CHECK-LABEL: binsli_d: +; CHECK: bsel.v + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %b = load <2 x i64>, <2 x i64> * %ptr2, align 16 + %r = call <2 x i64> @llvm.mips.binsli.d(<2 x i64> %a, <2 x i64> %b, i32 4) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @binsri_d(<2 x i64> * %ptr, <2 x i64> * %ptr2) { +entry: +; CHECK-LABEL: binsri_d: +; CHECK: binsri.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %b = load <2 x i64>, <2 x i64> * %ptr2, align 16 + %r = call <2 x i64> @llvm.mips.binsri.d(<2 x i64> %a, <2 x i64> %b, i32 5) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @bnegi_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: bnegi_d: +; CHECK: xor.v + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.bnegi.d(<2 x i64> %a, i32 9) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @bseti_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: bseti_d: +; CHECK: or.v + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> 
@llvm.mips.bseti.d(<2 x i64> %a, i32 25) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @clei_s_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: clei_s_d: +; CHECK: clei_s.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.clei.s.d(<2 x i64> %a, i32 15) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @clei_u_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: clei_u_d: +; CHECK: clei_u.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.clei.u.d(<2 x i64> %a, i32 25) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @clti_s_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: clti_s_d: +; CHECK: clti_s.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.clti.s.d(<2 x i64> %a, i32 15) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @clti_u_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: clti_u_d: +; CHECK: clti_u.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.clti.u.d(<2 x i64> %a, i32 25) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @ldi_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: ldi_d: +; CHECK: ldi.d + %r = call <2 x i64> @llvm.mips.ldi.d(i32 3) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @maxi_s_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: maxi_s_d: +; CHECK: maxi_s.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.maxi.s.d(<2 x i64> %a, i32 2) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @maxi_u_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: maxi_u_d: +; CHECK: maxi_u.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.maxi.u.d(<2 x i64> %a, i32 2) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @mini_s_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: mini_s_d: +; CHECK: mini_s.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.mini.s.d(<2 x i64> %a, i32 2) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @mini_u_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: mini_u_d: +; CHECK: mini_u.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.mini.u.d(<2 x i64> %a, i32 2) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @sldi_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: sldi_d: +; CHECK: sldi.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %a, <2 x i64> %a, i32 1) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @slli_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: slli_d: +; CHECK: slli.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.slli.d(<2 x i64> %a, i32 3) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @srai_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: srai_d: +; CHECK: srai.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.srai.d(<2 x i64> %a, i32 3) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @srari_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: srari_d: +; CHECK: srari.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.srari.d(<2 x i64> %a, i32 3) + store <2 x i64> %r, <2 x i64> * 
%ptr, align 16 + ret void +} + +define void @srli_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: srli_d: +; CHECK: srli.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.srli.d(<2 x i64> %a, i32 3) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @srlri_d(<2 x i64> * %ptr) { +entry: +; CHECK-LABEL: srlri_d: +; CHECK: srlri.d + %a = load <2 x i64>, <2 x i64> * %ptr, align 16 + %r = call <2 x i64> @llvm.mips.srlri.d(<2 x i64> %a, i32 3) + store <2 x i64> %r, <2 x i64> * %ptr, align 16 + ret void +} + +define void @ld_d(<2 x i64> * %ptr, i8 * %ldptr, i32 %offset) { +entry: +; CHECK-LABEL: ld_d +; MSA32: addu $[[R0:[0-9]]], $5, $6 +; MSA64N32: sll $[[R2:[0-9]]], $6, 0 +; MSA64N32: sll $[[R1:[0-9]]], $5, 0 +; MSA64N32: addu $[[R0:[0-9]]], $[[R1]], $[[R2]] +; MSA64N64: sll $[[R1:[0-9]]], $6, 0 +; MSA64N64: daddu $[[R0:[0-9]]], $5, $[[R1]] +; CHECK: ld.d $w{{[0-9]+}}, 0($[[R0]]) + %a = call <2 x i64> @llvm.mips.ld.d(i8* %ldptr, i32 %offset) + store <2 x i64> %a, <2 x i64> * %ptr, align 16 + ret void +} + +define void @ld_d2(<2 x i64> * %ptr, i8 * %ldptr) { +entry: +; CHECK-LABEL: ld_d2 +; MSA32: addiu $[[R0:[0-9]]], $5, 4096 +; MSA64N32: sll $[[R1:[0-9]]], $5, 0 +; MSA64N32: addiu $[[R0:[0-9]]], $[[R1]], 4096 +; MSA64N64: daddiu $[[R0:[0-9]]], $5, 4096 +; CHECK: ld.d $w{{[0-9]+}}, 0($[[R0]]) + %a = call <2 x i64> @llvm.mips.ld.d(i8* %ldptr, i32 4096) + store <2 x i64> %a, <2 x i64> * %ptr, align 16 + ret void +} + +define void @st_d(<2 x i64> * %ptr, i8 * %ldptr, i32 %offset, i8 * %stptr) { +entry: +; CHECK-LABEL: st_d +; MSA32: addu $[[R0:[0-9]]], $7, $6 + +; MSA64N32-DAG: sll $[[R1:[0-9]]], $6, 0 +; MSA64N32-DAG: sll $[[R2:[0-9]+]], $7, 0 +; MSA64N32: addu $[[R0:[0-9]+]], $[[R2]], $[[R1]] + +; MSA64N64: sll $[[R1:[0-9]]], $6, 0 +; MSA64N64: daddu $[[R0:[0-9]]], $7, $[[R1]] +; CHECK: st.d $w{{[0-9]+}}, 0($[[R0]]) + %a = call <2 x i64> @llvm.mips.ld.d(i8* %ldptr, i32 0) + call void @llvm.mips.st.d(<2 x i64> %a, i8* %stptr, i32 %offset) + ret void +} + + +declare <8 x i16> @llvm.mips.ldi.h(i32) +declare <8 x i16> @llvm.mips.addvi.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.bclri.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.binsli.h(<8 x i16>, <8 x i16>, i32) +declare <8 x i16> @llvm.mips.binsri.h(<8 x i16>, <8 x i16>, i32) +declare <8 x i16> @llvm.mips.bnegi.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.bseti.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.clei.s.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.clei.u.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.clti.s.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.clti.u.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.maxi.s.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.maxi.u.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.mini.s.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.mini.u.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.sldi.h(<8 x i16>, <8 x i16>, i32) +declare <8 x i16> @llvm.mips.slli.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.splati.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.srai.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.srari.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.srli.h(<8 x i16>, i32) +declare <8 x i16> @llvm.mips.srlri.h(<8 x i16>, i32) +declare <4 x i32> @llvm.mips.addvi.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.bclri.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.binsli.w(<4 x i32>, <4 x i32>, i32) +declare <4 x i32> @llvm.mips.binsri.w(<4 x i32>, <4 x i32>, i32) +declare <4 x i32> @llvm.mips.bnegi.w(<4 x i32>, i32) 
+declare <4 x i32> @llvm.mips.bseti.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.ldi.w(i32) +declare <4 x i32> @llvm.mips.clei.s.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.clei.u.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.clti.s.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.clti.u.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.maxi.s.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.maxi.u.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.mini.s.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.mini.u.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.sldi.w(<4 x i32>, <4 x i32>, i32) +declare <4 x i32> @llvm.mips.slli.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.splati.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.srai.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.srari.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.srli.w(<4 x i32>, i32) +declare <4 x i32> @llvm.mips.srlri.w(<4 x i32>, i32) +declare <2 x i64> @llvm.mips.ldi.d(i32) +declare <2 x i64> @llvm.mips.addvi.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.bclri.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.binsli.d(<2 x i64>, <2 x i64>, i32) +declare <2 x i64> @llvm.mips.binsri.d(<2 x i64>, <2 x i64>, i32) +declare <2 x i64> @llvm.mips.bnegi.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.bseti.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.clei.s.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.clei.u.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.clti.s.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.clti.u.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.maxi.s.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.maxi.u.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.mini.s.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.mini.u.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.sldi.d(<2 x i64>, <2 x i64>, i32) +declare <2 x i64> @llvm.mips.slli.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.splati.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.srai.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.srari.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.srli.d(<2 x i64>, i32) +declare <2 x i64> @llvm.mips.srlri.d(<2 x i64>, i32) +declare <16 x i8> @llvm.mips.ldi.b(i32) +declare <16 x i8> @llvm.mips.addvi.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.andi.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.bclri.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.binsli.b(<16 x i8>, <16 x i8>, i32) +declare <16 x i8> @llvm.mips.binsri.b(<16 x i8>, <16 x i8>, i32) +declare <16 x i8> @llvm.mips.bmnzi.b(<16 x i8>, <16 x i8>, i32) +declare <16 x i8> @llvm.mips.bnegi.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.bseli.b(<16 x i8>, <16 x i8>, i32) +declare <16 x i8> @llvm.mips.bseti.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.clei.s.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.clei.u.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.clti.s.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.clti.u.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.maxi.s.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.maxi.u.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.mini.s.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.mini.u.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.nori.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.ori.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.sldi.b(<16 x i8>, <16 x i8>, i32) +declare <16 x i8> @llvm.mips.slli.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.splati.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.srai.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.srari.b(<16 x i8>, i32) +declare <16 x i8> 
@llvm.mips.srli.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.srlri.b(<16 x i8>, i32) +declare i32 @llvm.mips.copy.s.h(<8 x i16>, i32) +declare i32 @llvm.mips.copy.u.h(<8 x i16>, i32) +declare i32 @llvm.mips.copy.s.w(<4 x i32>, i32) +declare i32 @llvm.mips.copy.u.w(<4 x i32>, i32) +declare i64 @llvm.mips.copy.s.d(<2 x i64>, i32) +declare i64 @llvm.mips.copy.u.d(<2 x i64>, i32) +declare i32 @llvm.mips.copy.s.b(<16 x i8>, i32) +declare i32 @llvm.mips.copy.u.b(<16 x i8>, i32) +declare <16 x i8> @llvm.mips.bmzi.b(<16 x i8>, <16 x i8>, i32) +declare <16 x i8> @llvm.mips.ld.b(i8*, i32) +declare <8 x i16> @llvm.mips.ld.h(i8*, i32) +declare <4 x i32> @llvm.mips.ld.w(i8*, i32) +declare <2 x i64> @llvm.mips.ld.d(i8*, i32) +declare void @llvm.mips.st.b(<16 x i8>, i8*, i32) +declare void @llvm.mips.st.h(<8 x i16>, i8*, i32) +declare void @llvm.mips.st.w(<4 x i32>, i8*, i32) +declare void @llvm.mips.st.d(<2 x i64>, i8*, i32) diff --git a/test/CodeGen/Mips/msa/msa-nooddspreg.ll b/test/CodeGen/Mips/msa/msa-nooddspreg.ll new file mode 100644 index 000000000000..7cfc66650e6b --- /dev/null +++ b/test/CodeGen/Mips/msa/msa-nooddspreg.ll @@ -0,0 +1,55 @@ +; RUN: llc -march=mips -mcpu=mips32r5 -mattr=+fp64,+msa,+nooddspreg < %s | FileCheck %s + +; Test that the register allocator honours +nooddspreg and does not pick an odd +; single precision subregister of an MSA register. + +@f1 = external global float + +@f2 = external global float + +@v3 = external global <4 x float> + +@d1 = external global double + +define void @test() { +; CHECK-LABEL: test: +entry: +; CHECK-NOT: lwc1 $f{{[13579]+}} +; CHECK: lwc1 $f{{[02468]+}} + %0 = load float, float * @f1 + %1 = insertelement <4 x float> undef, float %0, i32 0 + %2 = insertelement <4 x float> %1, float %0, i32 1 + %3 = insertelement <4 x float> %2, float %0, i32 2 + %4 = insertelement <4 x float> %3, float %0, i32 3 + +; CHECK-NOT: lwc1 $f{{[13579]+}} +; CHECK: lwc1 $f{{[02468]+}} + %5 = load float, float * @f2 + %6 = insertelement <4 x float> undef, float %5, i32 0 + %7 = insertelement <4 x float> %6, float %5, i32 1 + %8 = insertelement <4 x float> %7, float %5, i32 2 + %9 = insertelement <4 x float> %8, float %5, i32 3 + + %10 = fadd <4 x float> %4, %9 + store <4 x float> %10, <4 x float> * @v3 + ret void +} + +; Test that the register allocator honours +nooddspreg and does not pick an odd +; single precision register for a load to perform a conversion to a double.
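; [Editor's sketch, illustrative only, not part of this commit: with
; -mattr=+nooddspreg the allocator must not hand out odd-numbered
; single-precision registers, so every lwc1 in these tests is expected to
; target an even $f register. A minimal form of the pattern being
; constrained here is simply:
;   %f = load float, float* @f1      ; lwc1 into $f0/$f2/$f4/...
;   %d = fpext float %f to double    ; widened in the paired 64-bit register
; ]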
+ +define void @test2() { +; CHECK-LABEL: test2: +entry: +; CHECK-NOT: lwc1 $f{{[13579]+}} +; CHECK: lwc1 $f{{[02468]+}} + %0 = load float, float * @f1 + %1 = fpext float %0 to double +; CHECK-NOT: lwc1 $f{{[13579]+}} +; CHECK: lwc1 $f{{[02468]+}} + %2 = load float, float * @f2 + %3 = fpext float %2 to double + %4 = fadd double %1, %3 + store double %4, double * @d1 + ret void +} diff --git a/test/CodeGen/NVPTX/fast-math.ll b/test/CodeGen/NVPTX/fast-math.ll index 9da26adc1511..d0a333d369ca 100644 --- a/test/CodeGen/NVPTX/fast-math.ll +++ b/test/CodeGen/NVPTX/fast-math.ll @@ -1,10 +1,8 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - declare float @llvm.nvvm.sqrt.f(float) - -; CHECK: sqrt_div +; CHECK-LABEL: sqrt_div ; CHECK: sqrt.rn.f32 ; CHECK: div.rn.f32 define float @sqrt_div(float %a, float %b) { @@ -13,7 +11,7 @@ define float @sqrt_div(float %a, float %b) { ret float %t2 } -; CHECK: sqrt_div_fast +; CHECK-LABEL: sqrt_div_fast ; CHECK: sqrt.approx.f32 ; CHECK: div.approx.f32 define float @sqrt_div_fast(float %a, float %b) #0 { @@ -22,22 +20,19 @@ define float @sqrt_div_fast(float %a, float %b) #0 { ret float %t2 } - -; CHECK: fadd -; CHECK: add.f32 +; CHECK-LABEL: fadd +; CHECK: add.rn.f32 define float @fadd(float %a, float %b) { %t1 = fadd float %a, %b ret float %t1 } -; CHECK: fadd_ftz -; CHECK: add.ftz.f32 +; CHECK-LABEL: fadd_ftz +; CHECK: add.rn.ftz.f32 define float @fadd_ftz(float %a, float %b) #1 { %t1 = fadd float %a, %b ret float %t1 } - - attributes #0 = { "unsafe-fp-math" = "true" } attributes #1 = { "nvptx-f32ftz" = "true" } diff --git a/test/CodeGen/PowerPC/change-no-infs.ll b/test/CodeGen/PowerPC/change-no-infs.ll new file mode 100644 index 000000000000..0cd5eb5408e3 --- /dev/null +++ b/test/CodeGen/PowerPC/change-no-infs.ll @@ -0,0 +1,67 @@ +; Check that we can enable/disable NoInfsFPMath and NoNaNsInFPMath via function +; attributes. An attribute on one function should not magically apply to the +; next one. + +; RUN: llc < %s -mtriple=powerpc64-unknown-unknown -mcpu=pwr7 -mattr=-vsx \ +; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE + +; RUN: llc < %s -mtriple=powerpc64-unknown-unknown -mcpu=pwr7 -mattr=-vsx \ +; RUN: -enable-no-infs-fp-math -enable-no-nans-fp-math \ +; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=UNSAFE + +; The fcmp+select in these functions should be converted to a fsel instruction +; when both NoInfsFPMath and NoNaNsInFPMath are enabled.
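; [Editor's note, illustrative only, not part of this commit: PowerPC's
; fsel FRT,FRA,FRC,FRB selects FRC when FRA >= 0.0 and FRB otherwise, and a
; NaN in FRA falls into the "otherwise" arm. The fcmp-ult + select pairs
; below cannot match those semantics exactly, so the fold is only sound once
; NaNs and infinities are assumed absent, which is what the UNSAFE run line
; above enables.]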
+ +; CHECK-LABEL: default0: +define double @default0(double %a, double %y, double %z) { +entry: +; SAFE-NOT: fsel +; UNSAFE: fsel + %cmp = fcmp ult double %a, 0.000000e+00 + %z.y = select i1 %cmp, double %z, double %y + ret double %z.y +} + +; CHECK-LABEL: unsafe_math_off: +define double @unsafe_math_off(double %a, double %y, double %z) #0 #2 { +entry: +; SAFE-NOT: fsel +; UNSAFE-NOT: fsel + %cmp = fcmp ult double %a, 0.000000e+00 + %z.y = select i1 %cmp, double %z, double %y + ret double %z.y +} + +; CHECK-LABEL: default1: +define double @default1(double %a, double %y, double %z) { +; SAFE-NOT: fsel +; UNSAFE: fsel + %cmp = fcmp ult double %a, 0.000000e+00 + %z.y = select i1 %cmp, double %z, double %y + ret double %z.y +} + +; CHECK-LABEL: unsafe_math_on: +define double @unsafe_math_on(double %a, double %y, double %z) #1 #3 { +entry: +; SAFE-NOT: fsel +; UNSAFE-NOT: fsel + %cmp = fcmp ult double %a, 0.000000e+00 + %z.y = select i1 %cmp, double %z, double %y + ret double %z.y +} + +; CHECK-LABEL: default2: +define double @default2(double %a, double %y, double %z) { +; SAFE-NOT: fsel +; UNSAFE: fsel + %cmp = fcmp ult double %a, 0.000000e+00 + %z.y = select i1 %cmp, double %z, double %y + ret double %z.y +} + +attributes #0 = { "no-infs-fp-math"="false" } +attributes #1 = { "no-nans-fp-math"="false" } + +attributes #2 = { "no-infs-fp-math"="false" } +attributes #3 = { "no-infs-fp-math"="true" } diff --git a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll index b61acab7f7cb..98862cd049a5 100644 --- a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll +++ b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll @@ -23,7 +23,7 @@ entry: ; CHECK: mfvsrd [[TOGPR:[0-9]+]], ; CHECK: srd [[RSHREG:[0-9]+]], [[TOGPR]], [[SHAMREG]] ; CHECK: extsw 3, [[RSHREG]] -; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 2 +; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29 ; CHECK-P7-DAG: stxvw4x 34, ; CHECK-P7: lwax 3, [[ELEMOFFREG]], ; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 2 @@ -52,7 +52,7 @@ entry: ; CHECK-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SHIFTREG]] ; CHECK-DAG: vperm [[PERMVEC:[0-9]+]], 2, 2, [[SHMSKREG]] ; CHECK: mfvsrd 3, -; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 3 +; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 3, 28, 28 ; CHECK-P7-DAG: stxvd2x 34, ; CHECK-P7: ldx 3, [[ELEMOFFREG]], ; CHECK-BE-DAG: andi. 
[[ANDREG:[0-9]+]], 5, 1 @@ -75,7 +75,7 @@ entry: ; CHECK: lvsl [[SHMSKREG:[0-9]+]], 0, [[TRUNCREG]] ; CHECK: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]] ; CHECK: xscvspdpn 1, -; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 2 +; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29 ; CHECK-P7-DAG: stxvw4x 34, ; CHECK-P7: lfsx 1, [[ELEMOFFREG]], ; CHECK-BE: sldi [[ELNOREG:[0-9]+]], 5, 2 diff --git a/test/CodeGen/WebAssembly/function-bitcasts.ll b/test/CodeGen/WebAssembly/function-bitcasts.ll index 49980da6eb8f..e4f8f3fb6ca9 100644 --- a/test/CodeGen/WebAssembly/function-bitcasts.ll +++ b/test/CodeGen/WebAssembly/function-bitcasts.ll @@ -7,11 +7,18 @@ target triple = "wasm32-unknown-unknown" ; CHECK-LABEL: test: ; CHECK-NEXT: call .Lbitcast@FUNCTION{{$}} +; CHECK-NEXT: call .Lbitcast@FUNCTION{{$}} ; CHECK-NEXT: call .Lbitcast.1@FUNCTION{{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0 ; CHECK-NEXT: call .Lbitcast.2@FUNCTION, $pop[[L0]]{{$}} +; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0 +; CHECK-NEXT: call .Lbitcast.2@FUNCTION, $pop[[L1]]{{$}} +; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 0 +; CHECK-NEXT: call .Lbitcast.2@FUNCTION, $pop[[L2]]{{$}} +; CHECK-NEXT: call foo0@FUNCTION ; CHECK-NEXT: i32.call $drop=, .Lbitcast.3@FUNCTION{{$}} ; CHECK-NEXT: call foo2@FUNCTION{{$}} +; CHECK-NEXT: call foo1@FUNCTION{{$}} ; CHECK-NEXT: call foo3@FUNCTION{{$}} ; CHECK-NEXT: .endfunc @@ -47,10 +54,19 @@ declare void @foo3() define void @test() { entry: call void bitcast (void (i32)* @has_i32_arg to void ()*)() + call void bitcast (void (i32)* @has_i32_arg to void ()*)() call void bitcast (i32 ()* @has_i32_ret to void ()*)() call void bitcast (void ()* @foo0 to void (i32)*)(i32 0) + %p = bitcast void ()* @foo0 to void (i32)* + call void %p(i32 0) + %q = bitcast void ()* @foo0 to void (i32)* + call void %q(i32 0) + %r = bitcast void (i32)* %q to void ()* + call void %r() %t = call i32 bitcast (void ()* @foo1 to i32 ()*)() call void bitcast (void ()* @foo2 to void ()*)() + call void @foo1() call void @foo3() + ret void } diff --git a/test/CodeGen/X86/atom-bypass-slow-division-64.ll b/test/CodeGen/X86/atom-bypass-slow-division-64.ll deleted file mode 100644 index 5980b7907c9f..000000000000 --- a/test/CodeGen/X86/atom-bypass-slow-division-64.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: llc < %s -mcpu=atom -march=x86-64 | FileCheck %s - -target triple = "x86_64-unknown-linux-gnu" - -; Additional tests for 64-bit divide bypass - -define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind { -; CHECK-LABEL: Test_get_quotient: -; CHECK: movq %rdi, %rax -; CHECK: orq %rsi, %rax -; CHECK-NEXT: testq $-65536, %rax -; CHECK-NEXT: je -; CHECK: idivq -; CHECK: ret -; CHECK: divw -; CHECK: ret - %result = sdiv i64 %a, %b - ret i64 %result -} - -define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind { -; CHECK-LABEL: Test_get_remainder: -; CHECK: movq %rdi, %rax -; CHECK: orq %rsi, %rax -; CHECK-NEXT: testq $-65536, %rax -; CHECK-NEXT: je -; CHECK: idivq -; CHECK: ret -; CHECK: divw -; CHECK: ret - %result = srem i64 %a, %b - ret i64 %result -} - -define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind { -; CHECK-LABEL: Test_get_quotient_and_remainder: -; CHECK: movq %rdi, %rax -; CHECK: orq %rsi, %rax -; CHECK-NEXT: testq $-65536, %rax -; CHECK-NEXT: je -; CHECK: idivq -; CHECK: divw -; CHECK: addq -; CHECK: ret -; CHECK-NOT: idivq -; CHECK-NOT: divw - %resultdiv = sdiv i64 %a, %b - %resultrem = srem i64 %a, %b - %result = add i64 %resultdiv, %resultrem - ret i64 %result -} diff --git 
a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll deleted file mode 100644 index 79001e5de192..000000000000 --- a/test/CodeGen/X86/atom-bypass-slow-division.ll +++ /dev/null @@ -1,112 +0,0 @@ -; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s - -define i32 @Test_get_quotient(i32 %a, i32 %b) nounwind { -; CHECK-LABEL: Test_get_quotient: -; CHECK: orl %ecx, %edx -; CHECK-NEXT: testl $-256, %edx -; CHECK-NEXT: je -; CHECK: idivl -; CHECK: ret -; CHECK: divb -; CHECK: ret - %result = sdiv i32 %a, %b - ret i32 %result -} - -define i32 @Test_get_remainder(i32 %a, i32 %b) nounwind { -; CHECK-LABEL: Test_get_remainder: -; CHECK: orl %ecx, %edx -; CHECK-NEXT: testl $-256, %edx -; CHECK-NEXT: je -; CHECK: idivl -; CHECK: ret -; CHECK: divb -; CHECK: ret - %result = srem i32 %a, %b - ret i32 %result -} - -define i32 @Test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind { -; CHECK-LABEL: Test_get_quotient_and_remainder: -; CHECK: orl %ecx, %edx -; CHECK-NEXT: testl $-256, %edx -; CHECK-NEXT: je -; CHECK: idivl -; CHECK: divb -; CHECK: addl -; CHECK: ret -; CHECK-NOT: idivl -; CHECK-NOT: divb - %resultdiv = sdiv i32 %a, %b - %resultrem = srem i32 %a, %b - %result = add i32 %resultdiv, %resultrem - ret i32 %result -} - -define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind { -; CHECK-LABEL: Test_use_div_and_idiv: -; CHECK: idivl -; CHECK: divb -; CHECK: divl -; CHECK: divb -; CHECK: addl -; CHECK: ret - %resultidiv = sdiv i32 %a, %b - %resultdiv = udiv i32 %a, %b - %result = add i32 %resultidiv, %resultdiv - ret i32 %result -} - -define i32 @Test_use_div_imm_imm() nounwind { -; CHECK-LABEL: Test_use_div_imm_imm: -; CHECK: movl $64 - %resultdiv = sdiv i32 256, 4 - ret i32 %resultdiv -} - -define i32 @Test_use_div_reg_imm(i32 %a) nounwind { -; CHECK-LABEL: Test_use_div_reg_imm: -; CHECK-NOT: test -; CHECK-NOT: idiv -; CHECK-NOT: divb - %resultdiv = sdiv i32 %a, 33 - ret i32 %resultdiv -} - -define i32 @Test_use_rem_reg_imm(i32 %a) nounwind { -; CHECK-LABEL: Test_use_rem_reg_imm: -; CHECK-NOT: test -; CHECK-NOT: idiv -; CHECK-NOT: divb - %resultrem = srem i32 %a, 33 - ret i32 %resultrem -} - -define i32 @Test_use_divrem_reg_imm(i32 %a) nounwind { -; CHECK-LABEL: Test_use_divrem_reg_imm: -; CHECK-NOT: test -; CHECK-NOT: idiv -; CHECK-NOT: divb - %resultdiv = sdiv i32 %a, 33 - %resultrem = srem i32 %a, 33 - %result = add i32 %resultdiv, %resultrem - ret i32 %result -} - -define i32 @Test_use_div_imm_reg(i32 %a) nounwind { -; CHECK-LABEL: Test_use_div_imm_reg: -; CHECK: test -; CHECK: idiv -; CHECK: divb - %resultdiv = sdiv i32 4, %a - ret i32 %resultdiv -} - -define i32 @Test_use_rem_imm_reg(i32 %a) nounwind { -; CHECK-LABEL: Test_use_rem_imm_reg: -; CHECK: test -; CHECK: idiv -; CHECK: divb - %resultdiv = sdiv i32 4, %a - ret i32 %resultdiv -} diff --git a/test/CodeGen/X86/atomic-eflags-reuse.ll b/test/CodeGen/X86/atomic-eflags-reuse.ll index dc1814b55cd3..9902325fd148 100644 --- a/test/CodeGen/X86/atomic-eflags-reuse.ll +++ b/test/CodeGen/X86/atomic-eflags-reuse.ll @@ -176,4 +176,84 @@ entry: ret i8 %tmp2 } +define i8 @test_add_1_cmov_cmov(i64* %p, i8* %q) #0 { +; TODO: It's possible to use "lock inc" here, but both cmovs need to be updated. 
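; [Editor's note, illustrative only, not part of this commit: `lock incq`
; would set EFLAGS from the incremented memory value, whereas the xadd+testq
; sequence checked below derives flags from the old value returned by the
; atomic RMW; folding to `lock inc` would therefore require rewriting the
; condition codes of both cmov users of those flags, which this combine does
; not yet do.]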
+; CHECK-LABEL: test_add_1_cmov_cmov: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: lock xaddq %rax, (%rdi) +; CHECK-NEXT: testq %rax, %rax +entry: + %add = atomicrmw add i64* %p, i64 1 seq_cst + %cmp = icmp slt i64 %add, 0 + %s1 = select i1 %cmp, i8 12, i8 34 + store i8 %s1, i8* %q + %s2 = select i1 %cmp, i8 56, i8 78 + ret i8 %s2 +} + +define i8 @test_sub_1_setcc_eq(i64* %p) #0 { +; CHECK-LABEL: test_sub_1_setcc_eq: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lock decq (%rdi) +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq +entry: + %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst + %tmp1 = icmp eq i64 %tmp0, 1 + %tmp2 = zext i1 %tmp1 to i8 + ret i8 %tmp2 +} + +define i8 @test_add_5_setcc_ne(i64* %p) #0 { +; CHECK-LABEL: test_add_5_setcc_ne: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lock addq $5, (%rdi) +; CHECK-NEXT: setne %al +; CHECK-NEXT: retq +entry: + %tmp0 = atomicrmw add i64* %p, i64 5 seq_cst + %tmp1 = icmp ne i64 %tmp0, -5 + %tmp2 = zext i1 %tmp1 to i8 + ret i8 %tmp2 +} + +define i8 @test_add_5_setcc_ne_comparand_mismatch(i64* %p) #0 { +; CHECK-LABEL: test_add_5_setcc_ne_comparand_mismatch: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movl $5, %eax +; CHECK-NEXT: lock xaddq %rax, (%rdi) +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: setne %al +; CHECK-NEXT: retq +entry: + %tmp0 = atomicrmw add i64* %p, i64 5 seq_cst + %tmp1 = icmp ne i64 %tmp0, 0 + %tmp2 = zext i1 %tmp1 to i8 + ret i8 %tmp2 +} + +declare void @g() +define zeroext i1 @test_sub_1_setcc_jcc(i64* %p) local_unnamed_addr #0 { +; TODO: It's possible to use "lock dec" here, but both uses of the cmp need to +; be updated. +; CHECK-LABEL: test_sub_1_setcc_jcc: +; CHECK: # BB#0: # %entry +; CHECK: movq $-1, %rax +; CHECK-NEXT: lock xaddq %rax, (%rdi) +; CHECK-NEXT: cmpq $1, %rax +; CHECK-NEXT: sete %bl +; CHECK-NEXT: jne +entry: + %add = atomicrmw volatile add i64* %p, i64 -1 seq_cst + %cmp = icmp ne i64 %add, 1 + %not = xor i1 %cmp, true + br i1 %cmp, label %else, label %then +then: + tail call void @g() + br label %else +else: + ret i1 %not +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/X86/avx-cvt.ll b/test/CodeGen/X86/avx-cvt.ll index c8e806890d07..a7cd8cf23984 100644 --- a/test/CodeGen/X86/avx-cvt.ll +++ b/test/CodeGen/X86/avx-cvt.ll @@ -62,6 +62,17 @@ define <8 x float> @fptrunc00(<8 x double> %b) nounwind { ret <8 x float> %a } +define <4 x float> @fptrunc01(<2 x double> %a0, <4 x float> %a1) nounwind { +; CHECK-LABEL: fptrunc01: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %ext = extractelement <2 x double> %a0, i32 0 + %cvt = fptrunc double %ext to float + %res = insertelement <4 x float> %a1, float %cvt, i32 0 + ret <4 x float> %res +} + define <4 x double> @fpext00(<4 x float> %b) nounwind { ; CHECK-LABEL: fpext00: ; CHECK: # BB#0: @@ -71,6 +82,17 @@ define <4 x double> @fpext00(<4 x float> %b) nounwind { ret <4 x double> %a } +define <2 x double> @fpext01(<2 x double> %a0, <4 x float> %a1) nounwind { +; CHECK-LABEL: fpext01: +; CHECK: # BB#0: +; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %ext = extractelement <4 x float> %a1, i32 0 + %cvt = fpext float %ext to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + define double @funcA(i64* nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcA: ; CHECK: # BB#0: diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll index 789ca2413940..c729b988cfb8 100755 --- a/test/CodeGen/X86/avx-trunc.ll +++ 
b/test/CodeGen/X86/avx-trunc.ll @@ -39,3 +39,29 @@ define <16 x i8> @trunc_16_8(<16 x i16> %A) nounwind uwtable readnone ssp{ %B = trunc <16 x i16> %A to <16 x i8> ret <16 x i8> %B } + +define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) { +; CHECK-LABEL: usat_trunc_wb_256: +; CHECK: # BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + ret <16 x i8> %x6 +} + +define <8 x i16> @usat_trunc_dw_256(<8 x i32> %i) { +; CHECK-LABEL: usat_trunc_dw_256: +; CHECK: # BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x3 = icmp ult <8 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %x5 = select <8 x i1> %x3, <8 x i32> %i, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %x6 = trunc <8 x i32> %x5 to <8 x i16> + ret <8 x i16> %x6 +} diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index 5e50a3aef2f2..87deeb9e16c0 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW + define <16 x float> @sitof32(<16 x i32> %a) nounwind { ; ALL-LABEL: sitof32: @@ -12,255 +18,304 @@ define <16 x float> @sitof32(<16 x i32> %a) nounwind { } define <8 x double> @sltof864(<8 x i64> %a) { -; KNL-LABEL: sltof864: -; KNL: ## BB#0: -; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; 
KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 -; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 -; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 -; KNL-NEXT: vmovq %xmm2, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 -; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 -; KNL-NEXT: vmovq %xmm2, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 -; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0 -; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; KNL-NEXT: retq +; NODQ-LABEL: sltof864: +; NODQ: ## BB#0: +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NODQ-NEXT: retq ; -; SKX-LABEL: sltof864: -; SKX: ## BB#0: -; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0 -; SKX-NEXT: retq +; DQ-LABEL: sltof864: +; DQ: ## BB#0: +; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; DQ-NEXT: retq %b = sitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } define <4 x double> @sltof464(<4 x i64> %a) { -; KNL-LABEL: sltof464: -; KNL: ## BB#0: -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 -; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 -; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 -; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: retq +; NODQ-LABEL: sltof464: +; NODQ: ## BB#0: +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 +; NODQ-NEXT: 
vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: sltof464: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0 +; VLDQ-NEXT: retq ; -; SKX-LABEL: sltof464: -; SKX: ## BB#0: -; SKX-NEXT: vcvtqq2pd %ymm0, %ymm0 -; SKX-NEXT: retq +; AVX512DQ-LABEL: sltof464: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512DQ-NEXT: retq %b = sitofp <4 x i64> %a to <4 x double> ret <4 x double> %b } define <2 x float> @sltof2f32(<2 x i64> %a) { -; KNL-LABEL: sltof2f32: -; KNL: ## BB#0: -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 -; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] -; KNL-NEXT: retq +; NODQ-LABEL: sltof2f32: +; NODQ: ## BB#0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; NODQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; NODQ-NEXT: retq ; -; SKX-LABEL: sltof2f32: -; SKX: ## BB#0: -; SKX-NEXT: vcvtqq2ps %xmm0, %xmm0 -; SKX-NEXT: retq +; VLDQ-LABEL: sltof2f32: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 +; VLDQ-NEXT: retq +; +; AVX512DQ-LABEL: sltof2f32: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: retq %b = sitofp <2 x i64> %a to <2 x float> ret <2 x float>%b } define <4 x float> @sltof4f32_mem(<4 x i64>* %a) { -; KNL-LABEL: sltof4f32_mem: -; KNL: ## BB#0: -; KNL-NEXT: vmovdqu (%rdi), %ymm0 -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; KNL-NEXT: retq +; NODQ-LABEL: sltof4f32_mem: +; NODQ: ## BB#0: +; NODQ-NEXT: vmovdqu (%rdi), %ymm0 +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; NODQ-NEXT: retq +; +; VLDQ-LABEL: sltof4f32_mem: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 +; VLDQ-NEXT: retq ; -; SKX-LABEL: sltof4f32_mem: -; SKX: ## BB#0: -; SKX-NEXT: vcvtqq2psy (%rdi), %xmm0 -; SKX-NEXT: 
retq +; AVX512DQ-LABEL: sltof4f32_mem: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vmovups (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: retq %a1 = load <4 x i64>, <4 x i64>* %a, align 8 %b = sitofp <4 x i64> %a1 to <4 x float> ret <4 x float>%b } define <4 x i64> @f64tosl(<4 x double> %a) { -; KNL-LABEL: f64tosl: -; KNL: ## BB#0: -; KNL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; KNL-NEXT: vcvttsd2si %xmm1, %rax -; KNL-NEXT: vmovq %rax, %xmm2 -; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; KNL-NEXT: vcvttsd2si %xmm1, %rax -; KNL-NEXT: vmovq %rax, %xmm1 -; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; KNL-NEXT: vcvttsd2si %xmm0, %rax -; KNL-NEXT: vmovq %rax, %xmm2 -; KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; KNL-NEXT: vcvttsd2si %xmm0, %rax -; KNL-NEXT: vmovq %rax, %xmm0 -; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: retq +; NODQ-LABEL: f64tosl: +; NODQ: ## BB#0: +; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1 +; NODQ-NEXT: vcvttsd2si %xmm1, %rax +; NODQ-NEXT: vmovq %rax, %xmm2 +; NODQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; NODQ-NEXT: vcvttsd2si %xmm1, %rax +; NODQ-NEXT: vmovq %rax, %xmm1 +; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; NODQ-NEXT: vcvttsd2si %xmm0, %rax +; NODQ-NEXT: vmovq %rax, %xmm2 +; NODQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; NODQ-NEXT: vcvttsd2si %xmm0, %rax +; NODQ-NEXT: vmovq %rax, %xmm0 +; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq ; -; SKX-LABEL: f64tosl: -; SKX: ## BB#0: -; SKX-NEXT: vcvttpd2qq %ymm0, %ymm0 -; SKX-NEXT: retq +; VLDQ-LABEL: f64tosl: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0 +; VLDQ-NEXT: retq +; +; AVX512DQ-LABEL: f64tosl: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512DQ-NEXT: retq %b = fptosi <4 x double> %a to <4 x i64> ret <4 x i64> %b } define <4 x i64> @f32tosl(<4 x float> %a) { -; KNL-LABEL: f32tosl: -; KNL: ## BB#0: -; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; KNL-NEXT: vcvttss2si %xmm1, %rax -; KNL-NEXT: vmovq %rax, %xmm1 -; KNL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; KNL-NEXT: vcvttss2si %xmm2, %rax -; KNL-NEXT: vmovq %rax, %xmm2 -; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; KNL-NEXT: vcvttss2si %xmm0, %rax -; KNL-NEXT: vmovq %rax, %xmm2 -; KNL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; KNL-NEXT: vcvttss2si %xmm0, %rax -; KNL-NEXT: vmovq %rax, %xmm0 -; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: retq +; NODQ-LABEL: f32tosl: +; NODQ: ## BB#0: +; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; NODQ-NEXT: vcvttss2si %xmm1, %rax +; NODQ-NEXT: vmovq %rax, %xmm1 +; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; NODQ-NEXT: vcvttss2si %xmm2, %rax +; NODQ-NEXT: vmovq %rax, %xmm2 +; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; NODQ-NEXT: vcvttss2si %xmm0, %rax +; NODQ-NEXT: vmovq %rax, %xmm2 +; NODQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; NODQ-NEXT: vcvttss2si %xmm0, %rax +; NODQ-NEXT: vmovq %rax, %xmm0 +; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: f32tosl: +; VLDQ: ## BB#0: +; 
VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 +; VLDQ-NEXT: retq ; -; SKX-LABEL: f32tosl: -; SKX: ## BB#0: -; SKX-NEXT: vcvttps2qq %xmm0, %ymm0 -; SKX-NEXT: retq +; AVX512DQ-LABEL: f32tosl: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512DQ-NEXT: retq %b = fptosi <4 x float> %a to <4 x i64> ret <4 x i64> %b } define <4 x float> @sltof432(<4 x i64> %a) { -; KNL-LABEL: sltof432: -; KNL: ## BB#0: -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; KNL-NEXT: retq +; NODQ-LABEL: sltof432: +; NODQ: ## BB#0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; NODQ-NEXT: retq +; +; VLDQ-LABEL: sltof432: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 +; VLDQ-NEXT: retq ; -; SKX-LABEL: sltof432: -; SKX: ## BB#0: -; SKX-NEXT: vcvtqq2ps %ymm0, %xmm0 -; SKX-NEXT: retq +; AVX512DQ-LABEL: sltof432: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: retq %b = sitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } define <4 x float> @ultof432(<4 x i64> %a) { -; KNL-LABEL: ultof432: -; KNL: ## BB#0: -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; KNL-NEXT: retq +; NODQ-LABEL: ultof432: +; NODQ: ## BB#0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; NODQ-NEXT: retq ; -; SKX-LABEL: ultof432: -; SKX: ## BB#0: -; SKX-NEXT: vcvtuqq2ps %ymm0, %xmm0 -; 
SKX-NEXT: retq +; VLDQ-LABEL: ultof432: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 +; VLDQ-NEXT: retq +; +; AVX512DQ-LABEL: ultof432: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: retq %b = uitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } define <8 x double> @ultof64(<8 x i64> %a) { -; KNL-LABEL: ultof64: -; KNL: ## BB#0: -; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 -; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 -; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 -; KNL-NEXT: vmovq %xmm2, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 -; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3 -; KNL-NEXT: vmovq %xmm2, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 -; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0 -; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; KNL-NEXT: retq +; NODQ-LABEL: ultof64: +; NODQ: ## BB#0: +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NODQ-NEXT: retq ; -; SKX-LABEL: ultof64: -; SKX: ## BB#0: -; SKX-NEXT: vcvtuqq2pd %zmm0, %zmm0 -; SKX-NEXT: retq +; DQ-LABEL: ultof64: +; DQ: ## BB#0: +; DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; DQ-NEXT: retq %b = uitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } @@ -284,33 +339,33 @@ define <16 x i32> @fptoui00(<16 x float> %a) nounwind { } define <8 x i32> @fptoui_256(<8 x float> %a) nounwind { -; KNL-LABEL: fptoui_256: -; KNL: ## BB#0: -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; KNL-NEXT: vcvttps2udq %zmm0, %zmm0 -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> -; KNL-NEXT: retq +; NOVL-LABEL: fptoui_256: +; NOVL: ## BB#0: +; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> 
%ZMM0<def> +; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 +; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NOVL-NEXT: retq ; -; SKX-LABEL: fptoui_256: -; SKX: ## BB#0: -; SKX-NEXT: vcvttps2udq %ymm0, %ymm0 -; SKX-NEXT: retq +; VL-LABEL: fptoui_256: +; VL: ## BB#0: +; VL-NEXT: vcvttps2udq %ymm0, %ymm0 +; VL-NEXT: retq %b = fptoui <8 x float> %a to <8 x i32> ret <8 x i32> %b } define <4 x i32> @fptoui_128(<4 x float> %a) nounwind { -; KNL-LABEL: fptoui_128: -; KNL: ## BB#0: -; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; KNL-NEXT: vcvttps2udq %zmm0, %zmm0 -; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> -; KNL-NEXT: retq +; NOVL-LABEL: fptoui_128: +; NOVL: ## BB#0: +; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 +; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; NOVL-NEXT: retq ; -; SKX-LABEL: fptoui_128: -; SKX: ## BB#0: -; SKX-NEXT: vcvttps2udq %xmm0, %xmm0 -; SKX-NEXT: retq +; VL-LABEL: fptoui_128: +; VL: ## BB#0: +; VL-NEXT: vcvttps2udq %xmm0, %xmm0 +; VL-NEXT: retq %b = fptoui <4 x float> %a to <4 x i32> ret <4 x i32> %b } @@ -325,17 +380,17 @@ define <8 x i32> @fptoui01(<8 x double> %a) nounwind { } define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind { -; KNL-LABEL: fptoui_256d: -; KNL: ## BB#0: -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0 -; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; KNL-NEXT: retq +; NOVL-LABEL: fptoui_256d: +; NOVL: ## BB#0: +; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0 +; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; NOVL-NEXT: retq ; -; SKX-LABEL: fptoui_256d: -; SKX: ## BB#0: -; SKX-NEXT: vcvttpd2udq %ymm0, %xmm0 -; SKX-NEXT: retq +; VL-LABEL: fptoui_256d: +; VL: ## BB#0: +; VL-NEXT: vcvttpd2udq %ymm0, %xmm0 +; VL-NEXT: retq %b = fptoui <4 x double> %a to <4 x i32> ret <4 x i32> %b } @@ -349,34 +404,34 @@ define <8 x double> @sitof64(<8 x i32> %a) { ret <8 x double> %b } define <8 x double> @sitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { -; KNL-LABEL: sitof64_mask: -; KNL: ## BB#0: -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} -; KNL-NEXT: retq +; NODQ-LABEL: sitof64_mask: +; NODQ: ## BB#0: +; NODQ-NEXT: kmovw %edi, %k1 +; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; NODQ-NEXT: retq ; -; SKX-LABEL: sitof64_mask: -; SKX: ## BB#0: -; SKX-NEXT: kmovb %edi, %k1 -; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} -; SKX-NEXT: retq +; DQ-LABEL: sitof64_mask: +; DQ: ## BB#0: +; DQ-NEXT: kmovb %edi, %k1 +; DQ-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; DQ-NEXT: retq %1 = bitcast i8 %c to <8 x i1> %2 = sitofp <8 x i32> %b to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a ret <8 x double> %3 } define <8 x double> @sitof64_maskz(<8 x i32> %a, i8 %b) nounwind { -; KNL-LABEL: sitof64_maskz: -; KNL: ## BB#0: -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} -; KNL-NEXT: retq +; NODQ-LABEL: sitof64_maskz: +; NODQ: ## BB#0: +; NODQ-NEXT: kmovw %edi, %k1 +; NODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; NODQ-NEXT: retq ; -; SKX-LABEL: sitof64_maskz: -; SKX: ## BB#0: -; SKX-NEXT: kmovb %edi, %k1 -; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; DQ-LABEL: sitof64_maskz: +; DQ: ## BB#0: +; DQ-NEXT: kmovb %edi, %k1 +; DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; DQ-NEXT: retq %1 = bitcast i8 %b to <8 x i1> %2 = sitofp <8 x i32> %a to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 
x double> zeroinitializer @@ -402,19 +457,19 @@ define <4 x i32> @fptosi03(<4 x double> %a) { } define <16 x float> @fptrunc00(<16 x double> %b) nounwind { -; KNL-LABEL: fptrunc00: -; KNL: ## BB#0: -; KNL-NEXT: vcvtpd2ps %zmm0, %ymm0 -; KNL-NEXT: vcvtpd2ps %zmm1, %ymm1 -; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; KNL-NEXT: retq +; NODQ-LABEL: fptrunc00: +; NODQ: ## BB#0: +; NODQ-NEXT: vcvtpd2ps %zmm0, %ymm0 +; NODQ-NEXT: vcvtpd2ps %zmm1, %ymm1 +; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NODQ-NEXT: retq ; -; SKX-LABEL: fptrunc00: -; SKX: ## BB#0: -; SKX-NEXT: vcvtpd2ps %zmm0, %ymm0 -; SKX-NEXT: vcvtpd2ps %zmm1, %ymm1 -; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; DQ-LABEL: fptrunc00: +; DQ: ## BB#0: +; DQ-NEXT: vcvtpd2ps %zmm0, %ymm0 +; DQ-NEXT: vcvtpd2ps %zmm1, %ymm1 +; DQ-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 +; DQ-NEXT: retq %a = fptrunc <16 x double> %b to <16 x float> ret <16 x float> %a } @@ -429,25 +484,36 @@ define <4 x float> @fptrunc01(<4 x double> %b) { } define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) { -; KNL-LABEL: fptrunc02: -; KNL: ## BB#0: -; KNL-NEXT: vpslld $31, %xmm1, %xmm1 -; KNL-NEXT: vpsrad $31, %xmm1, %xmm1 -; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; KNL-NEXT: vpand %xmm0, %xmm1, %xmm0 -; KNL-NEXT: retq +; NOVL-LABEL: fptrunc02: +; NOVL: ## BB#0: +; NOVL-NEXT: vpslld $31, %xmm1, %xmm1 +; NOVL-NEXT: vpsrad $31, %xmm1, %xmm1 +; NOVL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; NOVL-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: retq ; -; SKX-LABEL: fptrunc02: -; SKX: ## BB#0: -; SKX-NEXT: vpslld $31, %xmm1, %xmm1 -; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 -; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} -; SKX-NEXT: retq +; VL-LABEL: fptrunc02: +; VL: ## BB#0: +; VL-NEXT: vpslld $31, %xmm1, %xmm1 +; VL-NEXT: vptestmd %xmm1, %xmm1, %k1 +; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} +; VL-NEXT: retq %a = fptrunc <4 x double> %b to <4 x float> %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer ret <4 x float> %c } +define <4 x float> @fptrunc03(<2 x double> %a0, <4 x float> %a1) nounwind { +; ALL-LABEL: fptrunc03: +; ALL: ## BB#0: +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 +; ALL-NEXT: retq + %ext = extractelement <2 x double> %a0, i32 0 + %cvt = fptrunc double %ext to float + %res = insertelement <4 x float> %a1, float %cvt, i32 0 + ret <4 x float> %res +} + define <8 x double> @fpext00(<8 x float> %b) nounwind { ; ALL-LABEL: fpext00: ; ALL: ## BB#0: @@ -458,24 +524,35 @@ define <8 x double> @fpext00(<8 x float> %b) nounwind { } define <4 x double> @fpext01(<4 x float> %b, <4 x double>%b1, <4 x double>%a1) { -; KNL-LABEL: fpext01: -; KNL: ## BB#0: -; KNL-NEXT: vcvtps2pd %xmm0, %ymm0 -; KNL-NEXT: vcmpltpd %ymm2, %ymm1, %ymm1 -; KNL-NEXT: vandpd %ymm0, %ymm1, %ymm0 -; KNL-NEXT: retq +; NOVL-LABEL: fpext01: +; NOVL: ## BB#0: +; NOVL-NEXT: vcvtps2pd %xmm0, %ymm0 +; NOVL-NEXT: vcmpltpd %ymm2, %ymm1, %ymm1 +; NOVL-NEXT: vandpd %ymm0, %ymm1, %ymm0 +; NOVL-NEXT: retq ; -; SKX-LABEL: fpext01: -; SKX: ## BB#0: -; SKX-NEXT: vcmpltpd %ymm2, %ymm1, %k1 -; SKX-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} -; SKX-NEXT: retq +; VL-LABEL: fpext01: +; VL: ## BB#0: +; VL-NEXT: vcmpltpd %ymm2, %ymm1, %k1 +; VL-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} +; VL-NEXT: retq %a = fpext <4 x float> %b to <4 x double> %mask = fcmp ogt <4 x double>%a1, %b1 %c = select <4 x i1>%mask, <4 x double>%a, <4 x double>zeroinitializer ret <4 x double> %c } +define <2 x double> @fpext02(<2 x double> %a0, <4 x float> %a1) nounwind { +; ALL-LABEL: fpext02: +; ALL: ## 
BB#0: +; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: retq + %ext = extractelement <4 x float> %a1, i32 0 + %cvt = fpext float %ext to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + define double @funcA(i64* nocapture %e) { ; ALL-LABEL: funcA: ; ALL: ## BB#0: ## %entry @@ -589,53 +666,53 @@ define i32 @float_to_int(float %x) { } define <16 x double> @uitof64(<16 x i32> %a) nounwind { -; KNL-LABEL: uitof64: -; KNL: ## BB#0: -; KNL-NEXT: vcvtudq2pd %ymm0, %zmm2 -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; KNL-NEXT: vcvtudq2pd %ymm0, %zmm1 -; KNL-NEXT: vmovaps %zmm2, %zmm0 -; KNL-NEXT: retq +; NODQ-LABEL: uitof64: +; NODQ: ## BB#0: +; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm2 +; NODQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm1 +; NODQ-NEXT: vmovaps %zmm2, %zmm0 +; NODQ-NEXT: retq ; -; SKX-LABEL: uitof64: -; SKX: ## BB#0: -; SKX-NEXT: vcvtudq2pd %ymm0, %zmm2 -; SKX-NEXT: vextracti32x8 $1, %zmm0, %ymm0 -; SKX-NEXT: vcvtudq2pd %ymm0, %zmm1 -; SKX-NEXT: vmovaps %zmm2, %zmm0 -; SKX-NEXT: retq +; DQ-LABEL: uitof64: +; DQ: ## BB#0: +; DQ-NEXT: vcvtudq2pd %ymm0, %zmm2 +; DQ-NEXT: vextracti32x8 $1, %zmm0, %ymm0 +; DQ-NEXT: vcvtudq2pd %ymm0, %zmm1 +; DQ-NEXT: vmovaps %zmm2, %zmm0 +; DQ-NEXT: retq %b = uitofp <16 x i32> %a to <16 x double> ret <16 x double> %b } define <8 x double> @uitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { -; KNL-LABEL: uitof64_mask: -; KNL: ## BB#0: -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} -; KNL-NEXT: retq +; NODQ-LABEL: uitof64_mask: +; NODQ: ## BB#0: +; NODQ-NEXT: kmovw %edi, %k1 +; NODQ-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; NODQ-NEXT: retq ; -; SKX-LABEL: uitof64_mask: -; SKX: ## BB#0: -; SKX-NEXT: kmovb %edi, %k1 -; SKX-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} -; SKX-NEXT: retq +; DQ-LABEL: uitof64_mask: +; DQ: ## BB#0: +; DQ-NEXT: kmovb %edi, %k1 +; DQ-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; DQ-NEXT: retq %1 = bitcast i8 %c to <8 x i1> %2 = uitofp <8 x i32> %b to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a ret <8 x double> %3 } define <8 x double> @uitof64_maskz(<8 x i32> %a, i8 %b) nounwind { -; KNL-LABEL: uitof64_maskz: -; KNL: ## BB#0: -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} -; KNL-NEXT: retq +; NODQ-LABEL: uitof64_maskz: +; NODQ: ## BB#0: +; NODQ-NEXT: kmovw %edi, %k1 +; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; NODQ-NEXT: retq ; -; SKX-LABEL: uitof64_maskz: -; SKX: ## BB#0: -; SKX-NEXT: kmovb %edi, %k1 -; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; DQ-LABEL: uitof64_maskz: +; DQ: ## BB#0: +; DQ-NEXT: kmovb %edi, %k1 +; DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; DQ-NEXT: retq %1 = bitcast i8 %b to <8 x i1> %2 = uitofp <8 x i32> %a to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer @@ -643,17 +720,17 @@ define <8 x double> @uitof64_maskz(<8 x i32> %a, i8 %b) nounwind { } define <4 x double> @uitof64_256(<4 x i32> %a) nounwind { -; KNL-LABEL: uitof64_256: -; KNL: ## BB#0: -; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def> -; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> -; KNL-NEXT: retq +; NOVL-LABEL: uitof64_256: +; NOVL: ## BB#0: +; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NOVL-NEXT: retq ; -; SKX-LABEL: uitof64_256: -; SKX: ## BB#0: -; SKX-NEXT: vcvtudq2pd 
%xmm0, %ymm0 -; SKX-NEXT: retq +; VL-LABEL: uitof64_256: +; VL: ## BB#0: +; VL-NEXT: vcvtudq2pd %xmm0, %ymm0 +; VL-NEXT: retq %b = uitofp <4 x i32> %a to <4 x double> ret <4 x double> %b } @@ -668,33 +745,33 @@ define <16 x float> @uitof32(<16 x i32> %a) nounwind { } define <8 x float> @uitof32_256(<8 x i32> %a) nounwind { -; KNL-LABEL: uitof32_256: -; KNL: ## BB#0: -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0 -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> -; KNL-NEXT: retq +; NOVL-LABEL: uitof32_256: +; NOVL: ## BB#0: +; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NOVL-NEXT: retq ; -; SKX-LABEL: uitof32_256: -; SKX: ## BB#0: -; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0 -; SKX-NEXT: retq +; VL-LABEL: uitof32_256: +; VL: ## BB#0: +; VL-NEXT: vcvtudq2ps %ymm0, %ymm0 +; VL-NEXT: retq %b = uitofp <8 x i32> %a to <8 x float> ret <8 x float> %b } define <4 x float> @uitof32_128(<4 x i32> %a) nounwind { -; KNL-LABEL: uitof32_128: -; KNL: ## BB#0: -; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0 -; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> -; KNL-NEXT: retq +; NOVL-LABEL: uitof32_128: +; NOVL: ## BB#0: +; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; NOVL-NEXT: retq ; -; SKX-LABEL: uitof32_128: -; SKX: ## BB#0: -; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0 -; SKX-NEXT: retq +; VL-LABEL: uitof32_128: +; VL: ## BB#0: +; VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; VL-NEXT: retq %b = uitofp <4 x i32> %a to <4 x float> ret <4 x float> %b } @@ -736,21 +813,21 @@ define double @uitofp03(i32 %a) nounwind { } define <16 x float> @sitofp_16i1_float(<16 x i32> %a) { -; KNL-LABEL: sitofp_16i1_float: -; KNL: ## BB#0: -; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vcvtdq2ps %zmm0, %zmm0 -; KNL-NEXT: retq +; NODQ-LABEL: sitofp_16i1_float: +; NODQ: ## BB#0: +; NODQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0 +; NODQ-NEXT: retq ; -; SKX-LABEL: sitofp_16i1_float: -; SKX: ## BB#0: -; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 -; SKX-NEXT: vpmovm2d %k0, %zmm0 -; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 -; SKX-NEXT: retq +; DQ-LABEL: sitofp_16i1_float: +; DQ: ## BB#0: +; DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 +; DQ-NEXT: vpmovm2d %k0, %zmm0 +; DQ-NEXT: vcvtdq2ps %zmm0, %zmm0 +; DQ-NEXT: retq %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = sitofp <16 x i1> %mask to <16 x float> ret <16 x float> %1 @@ -799,157 +876,259 @@ define <8 x double> @sitofp_8i8_double(<8 x i8> %a) { } define <16 x double> @sitofp_16i1_double(<16 x double> %a) { -; KNL-LABEL: sitofp_16i1_double: -; KNL: ## BB#0: -; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 -; KNL-NEXT: vcmpltpd %zmm1, %zmm2, %k1 -; KNL-NEXT: vcmpltpd %zmm0, %zmm2, %k2 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; KNL-NEXT: vpmovqd %zmm1, %ymm1 -; KNL-NEXT: vcvtdq2pd %ymm1, %zmm1 -; KNL-NEXT: retq +; NOVLDQ-LABEL: sitofp_16i1_double: +; NOVLDQ: 
## BB#0: +; NOVLDQ-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; NOVLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2 +; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0 +; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; NOVLDQ-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NOVLDQ-NEXT: vpmovqd %zmm1, %ymm1 +; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: sitofp_16i1_double: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vxorpd %zmm2, %zmm2, %zmm2 +; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0 +; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1 +; VLDQ-NEXT: vpmovm2d %k1, %ymm0 +; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; VLDQ-NEXT: vpmovm2d %k0, %ymm1 +; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: sitofp_16i1_double: +; VLNODQ: ## BB#0: +; VLNODQ-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; VLNODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2 +; VLNODQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm0 {%k2} {z} +; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; VLNODQ-NEXT: retq ; -; SKX-LABEL: sitofp_16i1_double: -; SKX: ## BB#0: -; SKX-NEXT: vxorpd %zmm2, %zmm2, %zmm2 -; SKX-NEXT: vcmpltpd %zmm1, %zmm2, %k0 -; SKX-NEXT: vcmpltpd %zmm0, %zmm2, %k1 -; SKX-NEXT: vpmovm2d %k1, %ymm0 -; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 -; SKX-NEXT: vpmovm2d %k0, %ymm1 -; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 -; SKX-NEXT: retq +; AVX512DQ-LABEL: sitofp_16i1_double: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vxorpd %zmm2, %zmm2, %zmm2 +; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0 +; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1 +; AVX512DQ-NEXT: vpmovm2q %k1, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm1 +; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; AVX512DQ-NEXT: retq %cmpres = fcmp ogt <16 x double> %a, zeroinitializer %1 = sitofp <16 x i1> %cmpres to <16 x double> ret <16 x double> %1 } define <8 x double> @sitofp_8i1_double(<8 x double> %a) { -; KNL-LABEL: sitofp_8i1_double: -; KNL: ## BB#0: -; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL-NEXT: retq +; NOVLDQ-LABEL: sitofp_8i1_double: +; NOVLDQ: ## BB#0: +; NOVLDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0 +; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; NOVLDQ-NEXT: retq ; -; SKX-LABEL: sitofp_8i1_double: -; SKX: ## BB#0: -; SKX-NEXT: vxorpd %zmm1, %zmm1, %zmm1 -; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k0 -; SKX-NEXT: vpmovm2d %k0, %ymm0 -; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 -; SKX-NEXT: retq +; VLDQ-LABEL: sitofp_8i1_double: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vxorpd %zmm1, %zmm1, %zmm1 +; VLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %ymm0 +; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: sitofp_8i1_double: +; VLNODQ: ## BB#0: +; VLNODQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; VLNODQ-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_8i1_double: +; AVX512DQ: ## 
BB#0: +; AVX512DQ-NEXT: vxorpd %zmm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: retq %cmpres = fcmp ogt <8 x double> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x double> ret <8 x double> %1 } define <8 x float> @sitofp_8i1_float(<8 x float> %a) { -; KNL-LABEL: sitofp_8i1_float: -; KNL: ## BB#0: -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: vcvtdq2ps %ymm0, %ymm0 -; KNL-NEXT: retq +; NOVLDQ-LABEL: sitofp_8i1_float: +; NOVLDQ: ## BB#0: +; NOVLDQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NOVLDQ-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1 +; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0 +; NOVLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0 +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: sitofp_8i1_float: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; VLDQ-NEXT: vcmpltps %ymm0, %ymm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %ymm0 +; VLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0 +; VLDQ-NEXT: retq ; -; SKX-LABEL: sitofp_8i1_float: -; SKX: ## BB#0: -; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; SKX-NEXT: vcmpltps %ymm0, %ymm1, %k0 -; SKX-NEXT: vpmovm2d %k0, %ymm0 -; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 -; SKX-NEXT: retq +; VLNODQ-LABEL: sitofp_8i1_float: +; VLNODQ: ## BB#0: +; VLNODQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; VLNODQ-NEXT: vcmpltps %ymm0, %ymm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2ps %ymm0, %ymm0 +; VLNODQ-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_8i1_float: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512DQ-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512DQ-NEXT: retq %cmpres = fcmp ogt <8 x float> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x float> ret <8 x float> %1 } define <4 x float> @sitofp_4i1_float(<4 x float> %a) { -; KNL-LABEL: sitofp_4i1_float: -; KNL: ## BB#0: -; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vcvtdq2ps %xmm0, %xmm0 -; KNL-NEXT: retq +; NOVL-LABEL: sitofp_4i1_float: +; NOVL: ## BB#0: +; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VLDQ-LABEL: sitofp_4i1_float: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 +; VLDQ-NEXT: retq ; -; SKX-LABEL: sitofp_4i1_float: -; SKX: ## BB#0: -; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 -; SKX-NEXT: vpmovm2d %k0, %xmm0 -; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; SKX-NEXT: retq +; VLNODQ-LABEL: sitofp_4i1_float: +; VLNODQ: ## BB#0: +; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0 +; VLNODQ-NEXT: retq %cmpres = fcmp ogt <4 x float> %a, zeroinitializer %1 = sitofp <4 x i1> 
%cmpres to <4 x float> ret <4 x float> %1 } define <4 x double> @sitofp_4i1_double(<4 x double> %a) { -; KNL-LABEL: sitofp_4i1_double: -; KNL: ## BB#0: -; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: vcvtdq2pd %xmm0, %ymm0 -; KNL-NEXT: retq +; NOVL-LABEL: sitofp_4i1_double: +; NOVL: ## BB#0: +; NOVL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; NOVL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; NOVL-NEXT: vpmovqd %zmm0, %ymm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; NOVL-NEXT: retq +; +; VLDQ-LABEL: sitofp_4i1_double: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; VLDQ-NEXT: vcmpltpd %ymm0, %ymm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VLDQ-NEXT: retq ; -; SKX-LABEL: sitofp_4i1_double: -; SKX: ## BB#0: -; SKX-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; SKX-NEXT: vcmpltpd %ymm0, %ymm1, %k0 -; SKX-NEXT: vpmovm2d %k0, %xmm0 -; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; SKX-NEXT: retq +; VLNODQ-LABEL: sitofp_4i1_double: +; VLNODQ: ## BB#0: +; VLNODQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; VLNODQ-NEXT: vcmpltpd %ymm0, %ymm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VLNODQ-NEXT: retq %cmpres = fcmp ogt <4 x double> %a, zeroinitializer %1 = sitofp <4 x i1> %cmpres to <4 x double> ret <4 x double> %1 } define <2 x float> @sitofp_2i1_float(<2 x float> %a) { -; KNL-LABEL: sitofp_2i1_float: -; KNL: ## BB#0: -; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[1] -; KNL-NEXT: vcvtdq2ps %xmm0, %xmm0 -; KNL-NEXT: retq +; NOVL-LABEL: sitofp_2i1_float: +; NOVL: ## BB#0: +; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[1] +; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; NOVL-NEXT: retq ; -; SKX-LABEL: sitofp_2i1_float: -; SKX: ## BB#0: -; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 -; SKX-NEXT: vpmovm2d %k0, %xmm0 -; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; SKX-NEXT: retq +; VLDQ-LABEL: sitofp_2i1_float: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: sitofp_2i1_float: +; VLNODQ: ## BB#0: +; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0 +; VLNODQ-NEXT: retq %cmpres = fcmp ogt <2 x float> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x float> ret <2 x float> %1 } define <2 x double> @sitofp_2i1_double(<2 x double> %a) { -; KNL-LABEL: sitofp_2i1_double: -; KNL: ## BB#0: -; KNL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL-NEXT: vcvtdq2pd %xmm0, %xmm0 -; KNL-NEXT: retq +; NOVL-LABEL: sitofp_2i1_double: +; NOVL: ## BB#0: +; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VLDQ-LABEL: sitofp_2i1_double: +; VLDQ: ## BB#0: +; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0 +; VLDQ-NEXT: vpmovm2q %k0, %xmm0 +; VLDQ-NEXT: 
vcvtqq2pd %xmm0, %xmm0 +; VLDQ-NEXT: retq ; -; SKX-LABEL: sitofp_2i1_double: -; SKX: ## BB#0: -; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 -; SKX-NEXT: vpmovm2q %k0, %xmm0 -; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 -; SKX-NEXT: retq +; VLNODQ-LABEL: sitofp_2i1_double: +; VLNODQ: ## BB#0: +; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; VLNODQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 +; VLNODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VLNODQ-NEXT: retq %cmpres = fcmp ogt <2 x double> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x double> ret <2 x double> %1 @@ -989,174 +1168,187 @@ define <16 x float> @uitofp_16i1_float(<16 x i32> %a) { } define <16 x double> @uitofp_16i1_double(<16 x i32> %a) { -; KNL-LABEL: uitofp_16i1_double: -; KNL: ## BB#0: -; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: movq {{.*}}(%rip), %rax -; KNL-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 -; KNL-NEXT: kshiftrw $8, %k1, %k1 -; KNL-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z} -; KNL-NEXT: vpmovqd %zmm1, %ymm1 -; KNL-NEXT: vcvtudq2pd %ymm1, %zmm1 -; KNL-NEXT: retq +; NOVL-LABEL: uitofp_16i1_double: +; NOVL: ## BB#0: +; NOVL-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; NOVL-NEXT: movq {{.*}}(%rip), %rax +; NOVL-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} +; NOVL-NEXT: vpmovqd %zmm0, %ymm0 +; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; NOVL-NEXT: kshiftrw $8, %k1, %k1 +; NOVL-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z} +; NOVL-NEXT: vpmovqd %zmm1, %ymm1 +; NOVL-NEXT: vcvtudq2pd %ymm1, %zmm1 +; NOVL-NEXT: retq ; -; SKX-LABEL: uitofp_16i1_double: -; SKX: ## BB#0: -; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; SKX-NEXT: movl {{.*}}(%rip), %eax -; SKX-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} -; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 -; SKX-NEXT: kshiftrw $8, %k1, %k1 -; SKX-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} -; SKX-NEXT: vcvtudq2pd %ymm1, %zmm1 -; SKX-NEXT: retq +; VL-LABEL: uitofp_16i1_double: +; VL: ## BB#0: +; VL-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; VL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; VL-NEXT: movl {{.*}}(%rip), %eax +; VL-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} +; VL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; VL-NEXT: kshiftrw $8, %k1, %k1 +; VL-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} +; VL-NEXT: vcvtudq2pd %ymm1, %zmm1 +; VL-NEXT: retq %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = uitofp <16 x i1> %mask to <16 x double> ret <16 x double> %1 } define <8 x float> @uitofp_8i1_float(<8 x i32> %a) { -; KNL-LABEL: uitofp_8i1_float: -; KNL: ## BB#0: -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0 -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> -; KNL-NEXT: retq +; NOVL-LABEL: uitofp_8i1_float: +; NOVL: ## BB#0: +; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NOVL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; NOVL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; NOVL-NEXT: vpmovqd %zmm0, %ymm0 +; NOVL-NEXT: 
vcvtudq2ps %zmm0, %zmm0 +; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NOVL-NEXT: retq ; -; SKX-LABEL: uitofp_8i1_float: -; SKX: ## BB#0: -; SKX-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 -; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} -; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0 -; SKX-NEXT: retq +; VL-LABEL: uitofp_8i1_float: +; VL: ## BB#0: +; VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 +; VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} +; VL-NEXT: vcvtudq2ps %ymm0, %ymm0 +; VL-NEXT: retq %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x float> ret <8 x float> %1 } define <8 x double> @uitofp_8i1_double(<8 x i32> %a) { -; KNL-LABEL: uitofp_8i1_double: -; KNL: ## BB#0: -; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 -; KNL-NEXT: retq +; NOVL-LABEL: uitofp_8i1_double: +; NOVL: ## BB#0: +; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NOVL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; NOVL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; NOVL-NEXT: vpmovqd %zmm0, %ymm0 +; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; NOVL-NEXT: retq ; -; SKX-LABEL: uitofp_8i1_double: -; SKX: ## BB#0: -; SKX-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 -; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} -; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 -; SKX-NEXT: retq +; VL-LABEL: uitofp_8i1_double: +; VL: ## BB#0: +; VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 +; VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} +; VL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; VL-NEXT: retq %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x double> ret <8 x double> %1 } define <4 x float> @uitofp_4i1_float(<4 x i32> %a) { -; KNL-LABEL: uitofp_4i1_float: -; KNL: ## BB#0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 -; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; KNL-NEXT: retq +; NOVL-LABEL: uitofp_4i1_float: +; NOVL: ## BB#0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: retq ; -; SKX-LABEL: uitofp_4i1_float: -; SKX: ## BB#0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 -; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} -; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0 -; SKX-NEXT: retq +; VL-LABEL: uitofp_4i1_float: +; VL: ## BB#0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 +; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; VL-NEXT: retq %mask = icmp slt <4 x i32> %a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x float> ret <4 x float> %1 } define <4 x double> @uitofp_4i1_double(<4 x i32> %a) { -; KNL-LABEL: uitofp_4i1_double: -; KNL: ## BB#0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpsrld $31, %xmm0, %xmm0 -; KNL-NEXT: vcvtdq2pd %xmm0, %ymm0 -; KNL-NEXT: retq +; NOVL-LABEL: uitofp_4i1_double: +; NOVL: ## BB#0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpsrld $31, %xmm0, 
%xmm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; NOVL-NEXT: retq ; -; SKX-LABEL: uitofp_4i1_double: -; SKX: ## BB#0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 -; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} -; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0 -; SKX-NEXT: retq +; VL-LABEL: uitofp_4i1_double: +; VL: ## BB#0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 +; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; VL-NEXT: vcvtudq2pd %xmm0, %ymm0 +; VL-NEXT: retq %mask = icmp slt <4 x i32> %a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x double> ret <4 x double> %1 } define <2 x float> @uitofp_2i1_float(<2 x i32> %a) { -; KNL-LABEL: uitofp_2i1_float: -; KNL: ## BB#0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; KNL-NEXT: retq +; NOVL-LABEL: uitofp_2i1_float: +; NOVL: ## BB#0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; NOVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NOVL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpextrq $1, %xmm0, %rax +; NOVL-NEXT: andl $1, %eax +; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 +; NOVL-NEXT: vmovq %xmm0, %rax +; NOVL-NEXT: andl $1, %eax +; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 +; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; NOVL-NEXT: retq ; -; SKX-LABEL: uitofp_2i1_float: -; SKX: ## BB#0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 -; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} -; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0 -; SKX-NEXT: retq +; VL-LABEL: uitofp_2i1_float: +; VL: ## BB#0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; VL-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; VL-NEXT: retq %mask = icmp ult <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> ret <2 x float> %1 } define <2 x double> @uitofp_2i1_double(<2 x i32> %a) { -; KNL-LABEL: uitofp_2i1_double: -; KNL: ## BB#0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: retq +; NOVL-LABEL: uitofp_2i1_double: +; NOVL: ## BB#0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; NOVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NOVL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VLDQ-LABEL: uitofp_2i1_double: +; VLDQ: ## BB#0: +; 
VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; VLDQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} +; VLDQ-NEXT: vcvtuqq2pd %xmm0, %xmm0 +; VLDQ-NEXT: retq ; -; SKX-LABEL: uitofp_2i1_double: -; SKX: ## BB#0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 -; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} -; SKX-NEXT: vcvtuqq2pd %xmm0, %xmm0 -; SKX-NEXT: retq +; VLNODQ-LABEL: uitofp_2i1_double: +; VLNODQ: ## BB#0: +; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; VLNODQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0 +; VLNODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VLNODQ-NEXT: retq %mask = icmp ult <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x double> ret <2 x double> %1 diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll index 42579377ef39..3f427298c177 100644 --- a/test/CodeGen/X86/avx512-select.ll +++ b/test/CodeGen/X86/avx512-select.ll @@ -179,3 +179,22 @@ define float @pr30561_f32(float %b, float %a, i1 %c) { %cond = select i1 %c, float %a, float %b ret float %cond } + +define <16 x i16> @pr31515(<16 x i1> %a, <16 x i1> %b, <16 x i16> %c) nounwind { +; CHECK-LABEL: pr31515: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1 +; CHECK-NEXT: vpslld $31, %zmm1, %zmm1 +; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 +; CHECK-NEXT: vpslld $31, %zmm0, %zmm0 +; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k1 +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} +; CHECK-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpmovdw %zmm0, %ymm0 +; CHECK-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: retq + %mask = and <16 x i1> %a, %b + %res = select <16 x i1> %mask, <16 x i16> zeroinitializer, <16 x i16> %c + ret <16 x i16> %res +} + diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll index 04d21ecd3e82..fb6c55b26e7c 100644 --- a/test/CodeGen/X86/avx512-trunc.ll +++ b/test/CodeGen/X86/avx512-trunc.ll @@ -505,9 +505,8 @@ define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 { define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) { ; KNL-LABEL: usat_trunc_wb_256_mem: ; KNL: ## BB#0: -; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovdqu %xmm0, (%rdi) ; KNL-NEXT: retq ; @@ -525,9 +524,8 @@ define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) { define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) { ; KNL-LABEL: usat_trunc_wb_256: ; KNL: ## BB#0: -; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: usat_trunc_wb_256: @@ -607,3 +605,103 @@ define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) { ret void } +define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) { +; KNL-LABEL: usat_trunc_db_1024: +; KNL: ## BB#0: +; KNL-NEXT: vpmovusdb %zmm0, %xmm0 
+; KNL-NEXT: vpmovusdb %zmm1, %xmm1 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_db_1024: +; SKX: ## BB#0: +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 +; SKX-NEXT: vpminud %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vpminud %zmm2, %zmm0, %zmm0 +; SKX-NEXT: vpmovdw %zmm0, %ymm0 +; SKX-NEXT: vpmovdw %zmm1, %ymm1 +; SKX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; SKX-NEXT: vpmovwb %zmm0, %ymm0 +; SKX-NEXT: retq + %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x6 = trunc <32 x i32> %x5 to <32 x i8> + ret <32 x i8> %x6 +} + +define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) { +; KNL-LABEL: usat_trunc_db_1024_mem: +; KNL: ## BB#0: +; KNL-NEXT: vpmovusdb %zmm0, %xmm0 +; KNL-NEXT: vpmovusdb %zmm1, %xmm1 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: vmovdqu %ymm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_db_1024_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 +; SKX-NEXT: vpminud %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vpminud %zmm2, %zmm0, %zmm0 +; SKX-NEXT: vpmovdw %zmm0, %ymm0 +; SKX-NEXT: vpmovdw %zmm1, %ymm1 +; SKX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; SKX-NEXT: vpmovwb %zmm0, (%rdi) +; SKX-NEXT: retq + %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x6 = trunc <32 x i32> %x5 to <32 x i8> + store <32 x i8>%x6, <32 x i8>* %p, align 1 + ret void +} + +define <16 x i16> @usat_trunc_dw_512(<16 x i32> %i) { +; ALL-LABEL: usat_trunc_dw_512: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusdw %zmm0, %ymm0 +; ALL-NEXT: retq + %x3 = icmp ult <16 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> + %x6 = trunc <16 x i32> %x5 to <16 x i16> + ret <16 x i16> %x6 +} + +define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) { +; ALL-LABEL: usat_trunc_wb_128: +; ALL: ## BB#0: +; ALL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: retq + %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> 
+ %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x6 = trunc <8 x i16> %x5 to <8 x i8> + ret <8 x i8>%x6 +} + +define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) { +; KNL-LABEL: usat_trunc_qw_1024: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2 +; KNL-NEXT: vpminuq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: vpmovqd %zmm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_qw_1024: +; SKX: ## BB#0: +; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2 +; SKX-NEXT: vpminuq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; SKX-NEXT: vpmovqd %zmm0, %ymm0 +; SKX-NEXT: vpmovqd %zmm1, %ymm1 +; SKX-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 +; SKX-NEXT: vpmovdw %zmm0, %ymm0 +; SKX-NEXT: retq + %x3 = icmp ult <16 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %x6 = trunc <16 x i64> %x5 to <16 x i16> + ret <16 x i16> %x6 +} + diff --git a/test/CodeGen/X86/bypass-slow-division-32.ll b/test/CodeGen/X86/bypass-slow-division-32.ll new file mode 100644 index 000000000000..ea545d22385c --- /dev/null +++ b/test/CodeGen/X86/bypass-slow-division-32.ll @@ -0,0 +1,240 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that 32-bit division is bypassed correctly. 
+; RUN: llc < %s -mattr=+idivl-to-divb -mtriple=i686-linux | FileCheck %s + +define i32 @Test_get_quotient(i32 %a, i32 %b) nounwind { +; CHECK-LABEL: Test_get_quotient: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: testl $-256, %edx +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: cltd +; CHECK-NEXT: idivl %ecx +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def> +; CHECK-NEXT: divb %cl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: retl + %result = sdiv i32 %a, %b + ret i32 %result +} + +define i32 @Test_get_remainder(i32 %a, i32 %b) nounwind { +; CHECK-LABEL: Test_get_remainder: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: testl $-256, %edx +; CHECK-NEXT: je .LBB1_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: cltd +; CHECK-NEXT: idivl %ecx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB1_1: +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def> +; CHECK-NEXT: divb %cl +; CHECK-NEXT: movzbl %ah, %eax # NOREX +; CHECK-NEXT: retl + %result = srem i32 %a, %b + ret i32 %result +} + +define i32 @Test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind { +; CHECK-LABEL: Test_get_quotient_and_remainder: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: testl $-256, %edx +; CHECK-NEXT: je .LBB2_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: cltd +; CHECK-NEXT: idivl %ecx +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB2_1: +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def> +; CHECK-NEXT: divb %cl +; CHECK-NEXT: movzbl %ah, %edx # NOREX +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: retl + %resultdiv = sdiv i32 %a, %b + %resultrem = srem i32 %a, %b + %result = add i32 %resultdiv, %resultrem + ret i32 %result +} + +define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind { +; CHECK-LABEL: Test_use_div_and_idiv: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, %edi +; CHECK-NEXT: orl %ebx, %edi +; CHECK-NEXT: testl $-256, %edi +; CHECK-NEXT: je .LBB3_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: cltd +; CHECK-NEXT: idivl %ebx +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: testl $-256, %edi +; CHECK-NEXT: jne .LBB3_5 +; CHECK-NEXT: jmp .LBB3_4 +; CHECK-NEXT: .LBB3_1: +; CHECK-NEXT: movzbl %cl, %eax +; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def> +; CHECK-NEXT: divb %bl +; CHECK-NEXT: movzbl %al, %esi +; CHECK-NEXT: testl $-256, %edi +; CHECK-NEXT: je .LBB3_4 +; CHECK-NEXT: .LBB3_5: +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: divl %ebx +; CHECK-NEXT: jmp .LBB3_6 +; CHECK-NEXT: .LBB3_4: +; CHECK-NEXT: movzbl %cl, %eax +; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def> +; CHECK-NEXT: divb %bl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: .LBB3_6: +; CHECK-NEXT: addl %eax, %esi +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl 
%ebx +; CHECK-NEXT: retl + %resultidiv = sdiv i32 %a, %b + %resultdiv = udiv i32 %a, %b + %result = add i32 %resultidiv, %resultdiv + ret i32 %result +} + +define i32 @Test_use_div_imm_imm() nounwind { +; CHECK-LABEL: Test_use_div_imm_imm: +; CHECK: # BB#0: +; CHECK-NEXT: movl $64, %eax +; CHECK-NEXT: retl + %resultdiv = sdiv i32 256, 4 + ret i32 %resultdiv +} + +define i32 @Test_use_div_reg_imm(i32 %a) nounwind { +; CHECK-LABEL: Test_use_div_reg_imm: +; CHECK: # BB#0: +; CHECK-NEXT: movl $1041204193, %eax # imm = 0x3E0F83E1 +; CHECK-NEXT: imull {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $3, %edx +; CHECK-NEXT: leal (%edx,%eax), %eax +; CHECK-NEXT: retl + %resultdiv = sdiv i32 %a, 33 + ret i32 %resultdiv +} + +define i32 @Test_use_rem_reg_imm(i32 %a) nounwind { +; CHECK-LABEL: Test_use_rem_reg_imm: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl $1041204193, %edx # imm = 0x3E0F83E1 +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: imull %edx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $3, %edx +; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: shll $5, %eax +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: subl %eax, %ecx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: retl + %resultrem = srem i32 %a, 33 + ret i32 %resultrem +} + +define i32 @Test_use_divrem_reg_imm(i32 %a) nounwind { +; CHECK-LABEL: Test_use_divrem_reg_imm: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl $1041204193, %edx # imm = 0x3E0F83E1 +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: imull %edx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $3, %edx +; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: shll $5, %eax +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: subl %eax, %ecx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: retl + %resultdiv = sdiv i32 %a, 33 + %resultrem = srem i32 %a, 33 + %result = add i32 %resultdiv, %resultrem + ret i32 %result +} + +define i32 @Test_use_div_imm_reg(i32 %a) nounwind { +; CHECK-LABEL: Test_use_div_imm_reg: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: testl $-256, %ecx +; CHECK-NEXT: je .LBB8_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: movl $4, %eax +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: idivl %ecx +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB8_1: +; CHECK-NEXT: movb $4, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def> +; CHECK-NEXT: divb %cl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: retl + %resultdiv = sdiv i32 4, %a + ret i32 %resultdiv +} + +define i32 @Test_use_rem_imm_reg(i32 %a) nounwind { +; CHECK-LABEL: Test_use_rem_imm_reg: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: testl $-256, %ecx +; CHECK-NEXT: je .LBB9_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: movl $4, %eax +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: idivl %ecx +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB9_1: +; CHECK-NEXT: movb $4, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def> +; CHECK-NEXT: divb %cl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: retl + %resultdiv = sdiv i32 4, %a + ret i32 %resultdiv +} diff --git a/test/CodeGen/X86/bypass-slow-division-64.ll b/test/CodeGen/X86/bypass-slow-division-64.ll new file mode 100644 index 000000000000..b067f9e1503c --- /dev/null +++ b/test/CodeGen/X86/bypass-slow-division-64.ll 
@@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that 64-bit division is bypassed correctly. +; RUN: llc < %s -mattr=+idivq-to-divl -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +; Additional tests for 64-bit divide bypass + +define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: Test_get_quotient: +; CHECK: # BB#0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: orq %rsi, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: cqto +; CHECK-NEXT: idivq %rsi +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: divl %esi +; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def> +; CHECK-NEXT: retq + %result = sdiv i64 %a, %b + ret i64 %result +} + +define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: Test_get_remainder: +; CHECK: # BB#0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: orq %rsi, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: je .LBB1_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: cqto +; CHECK-NEXT: idivq %rsi +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB1_1: +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: divl %esi +; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: retq + %result = srem i64 %a, %b + ret i64 %result +} + +define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: Test_get_quotient_and_remainder: +; CHECK: # BB#0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: orq %rsi, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: je .LBB2_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: cqto +; CHECK-NEXT: idivq %rsi +; CHECK-NEXT: addq %rdx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB2_1: +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: divl %esi +; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def> +; CHECK-NEXT: addq %rdx, %rax +; CHECK-NEXT: retq + %resultdiv = sdiv i64 %a, %b + %resultrem = srem i64 %a, %b + %result = add i64 %resultdiv, %resultrem + ret i64 %result +} diff --git a/test/CodeGen/X86/bypass-slow-division-tune.ll b/test/CodeGen/X86/bypass-slow-division-tune.ll new file mode 100644 index 000000000000..b6a53130cf23 --- /dev/null +++ b/test/CodeGen/X86/bypass-slow-division-tune.ll @@ -0,0 +1,55 @@ +; Check that a division is bypassed when appropriate only. +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=atom < %s | FileCheck -check-prefixes=ATOM,CHECK %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=silvermont < %s | FileCheck -check-prefixes=REST,CHECK %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck -check-prefixes=REST,CHECK %s + +; Verify that div32 is bypassed only for Atoms. +define i32 @div32(i32 %a, i32 %b) { +entry: +; ATOM-LABEL: div32: +; ATOM: orl %{{.*}}, [[REG:%[a-z]+]] +; ATOM: testl $-256, [[REG]] +; ATOM: divb +; +; REST-LABEL: div32: +; REST-NOT: divb +; + %div = sdiv i32 %a, %b + ret i32 %div +} + +; Verify that div64 is always bypassed. +define i64 @div64(i64 %a, i64 %b) { +entry: +; CHECK-LABEL: div64: +; CHECK: orq %{{.*}}, [[REG:%[a-z]+]] +; CHECK: shrq $32, [[REG]] +; CHECK: divl +; + %div = sdiv i64 %a, %b + ret i64 %div +} + + +; Verify that no extra code is generated when optimizing for size. 
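;
; (The bypass duplicates the divide and adds a guard branch, trading code
; size for speed, so under optsize/minsize it is expected to stay off; the
; CHECK-NOTs below assert that the narrow fast-path divide never appears.)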
+ +define i64 @div64_optsize(i64 %a, i64 %b) optsize { +; CHECK-LABEL: div64_optsize: +; CHECK-NOT: divl + %div = sdiv i64 %a, %b + ret i64 %div +} + +define i32 @div32_optsize(i32 %a, i32 %b) optsize { +; CHECK-LABEL: div32_optsize: +; CHECK-NOT: divb + %div = sdiv i32 %a, %b + ret i32 %div +} + +define i32 @div32_minsize(i32 %a, i32 %b) minsize { +; CHECK-LABEL: div32_minsize: +; CHECK-NOT: divb + %div = sdiv i32 %a, %b + ret i32 %div +} diff --git a/test/CodeGen/X86/change-unsafe-fp-math.ll b/test/CodeGen/X86/change-unsafe-fp-math.ll new file mode 100644 index 000000000000..33a7ec9bfc79 --- /dev/null +++ b/test/CodeGen/X86/change-unsafe-fp-math.ll @@ -0,0 +1,56 @@ +; Check that we can enable/disable UnsafeFPMath via function attributes. An +; attribute on one function should not magically apply to the next one. + +; RUN: llc < %s -mtriple=x86_64-unknown-unknown \ +; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE + +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math \ +; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=UNSAFE + +; The div in these functions should be converted to a mul when unsafe-fp-math +; is enabled. + +; CHECK-LABEL: unsafe_fp_math_default0: +define double @unsafe_fp_math_default0(double %x) { +; SAFE: divsd +; UNSAFE: mulsd + %div = fdiv double %x, 2.0 + ret double %div +} + +; CHECK-LABEL: unsafe_fp_math_off: +define double @unsafe_fp_math_off(double %x) #0 { +; SAFE: divsd +; UNSAFE: divsd + %div = fdiv double %x, 2.0 + ret double %div +} + +; CHECK-LABEL: unsafe_fp_math_default1: +define double @unsafe_fp_math_default1(double %x) { +; With unsafe math enabled, can change this div to a mul. +; SAFE: divsd +; UNSAFE: mulsd + %div = fdiv double %x, 2.0 + ret double %div +} + +; CHECK-LABEL: unsafe_fp_math_on: +define double @unsafe_fp_math_on(double %x) #1 { +; SAFE: mulsd +; UNSAFE: mulsd + %div = fdiv double %x, 2.0 + ret double %div +} + +; CHECK-LABEL: unsafe_fp_math_default2: +define double @unsafe_fp_math_default2(double %x) { +; With unsafe math enabled, can change this div to a mul. +; SAFE: divsd +; UNSAFE: mulsd + %div = fdiv double %x, 2.0 + ret double %div +} + +attributes #0 = { "unsafe-fp-math"="false" } +attributes #1 = { "unsafe-fp-math"="true" } diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll index d24f27ddf22c..5d05c699f431 100644 --- a/test/CodeGen/X86/cmp.ll +++ b/test/CodeGen/X86/cmp.ll @@ -281,4 +281,54 @@ define void @test20(i32 %bf.load, i8 %x1, i8* %b_addr) { ; CHECK: setne ; CHECK: testl ; CHECK: setne -}
\ No newline at end of file +} + +define i32 @test21(i64 %val) { + %and = and i64 %val, -2199023255552 ; 0xFFFFFE0000000000 + %cmp = icmp ne i64 %and, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret + +; CHECK-LABEL: test21 +; CHECK: shrq $41, %rdi +; CHECK-NOT: test +; CHECK: setne %al +; CHECK: retq +} + +; AND-to-SHR transformation is enabled for eq/ne condition codes only. +define i32 @test22(i64 %val) { + %and = and i64 %val, -2199023255552 ; 0xFFFFFE0000000000 + %cmp = icmp ult i64 %and, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret + +; CHECK-LABEL: test22 +; CHECK-NOT: shrq $41 +; CHECK: retq +} + +define i32 @test23(i64 %val) { + %and = and i64 %val, -1048576 ; 0xFFFFFFFFFFF00000 + %cmp = icmp ne i64 %and, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret + +; CHECK-LABEL: test23 +; CHECK: testq $-1048576, %rdi +; CHECK: setne %al +; CHECK: retq +} + +define i32 @test24(i64 %val) { + %and = and i64 %val, 281474976710655 ; 0x0000FFFFFFFFFFFF + %cmp = icmp ne i64 %and, 0 + %ret = zext i1 %cmp to i32 + ret i32 %ret + +; CHECK-LABEL: test24 +; CHECK: shlq $16, %rdi +; CHECK-NOT: test +; CHECK: setne %al +; CHECK: retq +} diff --git a/test/CodeGen/X86/cpus.ll b/test/CodeGen/X86/cpus.ll index ee1f7bb5295b..20ce932a184b 100644 --- a/test/CodeGen/X86/cpus.ll +++ b/test/CodeGen/X86/cpus.ll @@ -33,3 +33,4 @@ ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bdver4 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=btver1 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=btver2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=znver1 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll index 13448a13ab4c..8c12e7148aa7 100644 --- a/test/CodeGen/X86/extractelement-index.ll +++ b/test/CodeGen/X86/extractelement-index.ll @@ -404,6 +404,7 @@ define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind { define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v16i8_var: ; SSE: # BB#0: +; SSE-NEXT: andl $15, %edi ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movb (%rdi,%rax), %al @@ -411,6 +412,7 @@ define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind { ; ; AVX-LABEL: extractelement_v16i8_var: ; AVX: # BB#0: +; AVX-NEXT: andl $15, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movb (%rdi,%rax), %al @@ -426,6 +428,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind { ; SSE-NEXT: movq %rsp, %rbp ; SSE-NEXT: andq $-32, %rsp ; SSE-NEXT: subq $64, %rsp +; SSE-NEXT: andl $31, %edi ; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, (%rsp) ; SSE-NEXT: movq %rsp, %rax @@ -440,6 +443,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind { ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp +; AVX-NEXT: andl $31, %edi ; AVX-NEXT: vmovaps %ymm0, (%rsp) ; AVX-NEXT: movq %rsp, %rax ; AVX-NEXT: movb (%rdi,%rax), %al @@ -454,12 +458,14 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind { define i16 @extractelement_v8i16_var(<8 x i16> %a, i256 %i) nounwind { ; SSE-LABEL: 
extractelement_v8i16_var: ; SSE: # BB#0: +; SSE-NEXT: andl $7, %edi ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v8i16_var: ; AVX: # BB#0: +; AVX-NEXT: andl $7, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; AVX-NEXT: retq @@ -474,6 +480,7 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind { ; SSE-NEXT: movq %rsp, %rbp ; SSE-NEXT: andq $-32, %rsp ; SSE-NEXT: subq $64, %rsp +; SSE-NEXT: andl $15, %edi ; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, (%rsp) ; SSE-NEXT: movzwl (%rsp,%rdi,2), %eax @@ -487,6 +494,7 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind { ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp +; AVX-NEXT: andl $15, %edi ; AVX-NEXT: vmovaps %ymm0, (%rsp) ; AVX-NEXT: movzwl (%rsp,%rdi,2), %eax ; AVX-NEXT: movq %rbp, %rsp @@ -500,12 +508,14 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind { define i32 @extractelement_v4i32_var(<4 x i32> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v4i32_var: ; SSE: # BB#0: +; SSE-NEXT: andl $3, %edi ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movl -24(%rsp,%rdi,4), %eax ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v4i32_var: ; AVX: # BB#0: +; AVX-NEXT: andl $3, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movl -24(%rsp,%rdi,4), %eax ; AVX-NEXT: retq @@ -520,6 +530,7 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind { ; SSE-NEXT: movq %rsp, %rbp ; SSE-NEXT: andq $-32, %rsp ; SSE-NEXT: subq $64, %rsp +; SSE-NEXT: andl $7, %edi ; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, (%rsp) ; SSE-NEXT: movl (%rsp,%rdi,4), %eax @@ -533,6 +544,7 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind { ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $7, %edi ; AVX1-NEXT: vmovaps %ymm0, (%rsp) ; AVX1-NEXT: movl (%rsp,%rdi,4), %eax ; AVX1-NEXT: movq %rbp, %rsp @@ -554,12 +566,14 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind { define i64 @extractelement_v2i64_var(<2 x i64> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v2i64_var: ; SSE: # BB#0: +; SSE-NEXT: andl $1, %edi ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -24(%rsp,%rdi,8), %rax ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v2i64_var: ; AVX: # BB#0: +; AVX-NEXT: andl $1, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq -24(%rsp,%rdi,8), %rax ; AVX-NEXT: retq @@ -574,6 +588,7 @@ define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind { ; SSE-NEXT: movq %rsp, %rbp ; SSE-NEXT: andq $-32, %rsp ; SSE-NEXT: subq $64, %rsp +; SSE-NEXT: andl $3, %edi ; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, (%rsp) ; SSE-NEXT: movq (%rsp,%rdi,8), %rax @@ -587,6 +602,7 @@ define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind { ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp +; AVX-NEXT: andl $3, %edi ; AVX-NEXT: vmovaps %ymm0, (%rsp) ; AVX-NEXT: movq (%rsp,%rdi,8), %rax ; AVX-NEXT: movq %rbp, %rsp diff --git a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll index 946516c8a46d..c418e67ecb67 100644 --- a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll +++ 
b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll @@ -16,11 +16,11 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" ; CHECK-NEXT: movl 20(%esp), %edx ; CHECK-NEXT: paddd (%edx), %xmm0 ; CHECK-NEXT: movdqa %xmm0, (%edx) -; CHECK-NEXT: shll $4, %ecx -; CHECK-NEXT: movl (%ecx,%edx), %esi -; CHECK-NEXT: movl 12(%ecx,%edx), %edi -; CHECK-NEXT: movl 8(%ecx,%edx), %ebx -; CHECK-NEXT: movl 4(%ecx,%edx), %edx +; CHECK-NEXT: movl (%edx), %esi +; CHECK-NEXT: movl 12(%edx), %edi +; CHECK-NEXT: movl 8(%edx), %ebx +; CHECK-NEXT: movl 4(%edx), %edx +; CHECK-NEXT: shll $4, %ecx ; CHECK-NEXT: movl %esi, 12(%eax,%ecx) ; CHECK-NEXT: movl %edx, (%eax,%ecx) ; CHECK-NEXT: movl %ebx, 8(%eax,%ecx) diff --git a/test/CodeGen/X86/i64-mem-copy.ll b/test/CodeGen/X86/i64-mem-copy.ll index 1fa752774251..7b1926da245c 100644 --- a/test/CodeGen/X86/i64-mem-copy.ll +++ b/test/CodeGen/X86/i64-mem-copy.ll @@ -68,9 +68,10 @@ define void @store_i64_from_vector256(<16 x i16> %x, <16 x i16> %y, i64* %i) { define void @PR23476(<5 x i64> %in, i64* %out, i32 %index) { ; X32-LABEL: PR23476: +; X32: andl $7, %eax ; X32: movsd {{.*#+}} xmm0 = mem[0],zero ; X32: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: movsd %xmm0, (%eax) +; X32-NEXT: movsd %xmm0, (%ecx) %ext = extractelement <5 x i64> %in, i32 %index store i64 %ext, i64* %out, align 8 ret void diff --git a/test/CodeGen/X86/implicit-null-checks.mir b/test/CodeGen/X86/implicit-null-checks.mir index 39f5242a8477..81351511374c 100644 --- a/test/CodeGen/X86/implicit-null-checks.mir +++ b/test/CodeGen/X86/implicit-null-checks.mir @@ -319,7 +319,7 @@ liveins: - { reg: '%rsi' } # CHECK: bb.0.entry: # CHECK: %rbx = MOV64rr %rdx -# CHECK-NEXT: %rdi = FAULTING_LOAD_OP %bb.3.is_null, 260, killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x) +# CHECK-NEXT: %rdi = FAULTING_LOAD_OP %bb.3.is_null, {{[0-9]+}}, killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x) body: | bb.0.entry: diff --git a/test/CodeGen/X86/lzcnt-zext-cmp.ll b/test/CodeGen/X86/lzcnt-zext-cmp.ll index 6f4cb84a2b9c..c69dbf573f46 100644 --- a/test/CodeGen/X86/lzcnt-zext-cmp.ll +++ b/test/CodeGen/X86/lzcnt-zext-cmp.ll @@ -3,6 +3,8 @@ ; Eg: zext(or(setcc(cmp), setcc(cmp))) -> shr(or(lzcnt, lzcnt)) ; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 -mattr=-fast-lzcnt | FileCheck --check-prefix=NOFASTLZCNT %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=znver1 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=znver1 -mattr=-fast-lzcnt | FileCheck --check-prefix=NOFASTLZCNT %s ; Test one 32-bit input, output is 32-bit, no transformations expected. define i32 @test_zext_cmp0(i32 %a) { diff --git a/test/CodeGen/X86/peephole.mir b/test/CodeGen/X86/peephole.mir new file mode 100644 index 000000000000..6391836e9ca2 --- /dev/null +++ b/test/CodeGen/X86/peephole.mir @@ -0,0 +1,40 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=peephole-opt %s -o - | FileCheck %s +--- | + define void @func() { ret void } +... +--- +# Check that instructions with MI.isBitcast() are only replaced by COPY if there +# are no SUBREG_TO_REG users. 
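The recurring pattern in the extractelement/insertelement updates above is that a variable vector index is now clamped with an AND (index masked to NumElements - 1, a power of two) before it addresses the on-stack spill slot, instead of being sign-extended with movslq. Because an out-of-range index on extractelement/insertelement yields an undefined result, masking is legal, and it keeps the load or store inside the 16- or 32-byte slot. A minimal sketch of the same check (hypothetical function name; the expected asm is taken from the var_extract update further below, and register choices may differ across llc versions):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s
define i32 @extract_var(<4 x i32> %x, i32 %idx) nounwind {
; CHECK-LABEL: extract_var:
; Index clamped to 0..3 before indexing the spill slot.
; CHECK: andl $3, %edi
; CHECK: movl -24(%rsp,%rdi,4), %eax
  %e = extractelement <4 x i32> %x, i32 %idx
  ret i32 %e
}

The new peephole.mir body below is unrelated to the index clamping; it pins down the bitcast-to-COPY rule stated in the comment above.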
+# CHECK-LABEL: name: func +name: func +registers: + - { id: 0, class: gr32 } + - { id: 1, class: fr32 } + - { id: 2, class: gr32 } + + - { id: 3, class: gr32 } + - { id: 4, class: fr32 } + - { id: 5, class: gr32 } + - { id: 6, class: gr64 } + +body: | + bb.0: + ; CHECK: %1 = VMOVDI2SSrr %0 + ; CHECK: %7 = COPY %0 + ; CHECK: NOOP implicit %7 + %0 = MOV32ri 42 + %1 = VMOVDI2SSrr %0 + %2 = MOVSS2DIrr %1 + NOOP implicit %2 + + ; CHECK: %4 = VMOVDI2SSrr %3 + ; CHECK-NOT: COPY + ; CHECK: %5 = MOVSS2DIrr %4 + ; CHECK: %6 = SUBREG_TO_REG %5, 0 + ; CHECK: NOOP implicit %6 + %3 = MOV32ri 42 + %4 = VMOVDI2SSrr %3 + %5 = MOVSS2DIrr %4 + %6 = SUBREG_TO_REG %5, 0, %subreg.sub_32bit + NOOP implicit %6 +... diff --git a/test/CodeGen/X86/slow-div.ll b/test/CodeGen/X86/slow-div.ll deleted file mode 100644 index 82928521ac2b..000000000000 --- a/test/CodeGen/X86/slow-div.ll +++ /dev/null @@ -1,43 +0,0 @@ -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivl-to-divb < %s | FileCheck -check-prefix=DIV32 %s -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivq-to-divw < %s | FileCheck -check-prefix=DIV64 %s - -define i32 @div32(i32 %a, i32 %b) { -entry: -; DIV32-LABEL: div32: -; DIV32: orl %{{.*}}, [[REG:%[a-z]+]] -; DIV32: testl $-256, [[REG]] -; DIV32: divb -; DIV64-LABEL: div32: -; DIV64-NOT: divb - %div = sdiv i32 %a, %b - ret i32 %div -} - -define i64 @div64(i64 %a, i64 %b) { -entry: -; DIV32-LABEL: div64: -; DIV32-NOT: divw -; DIV64-LABEL: div64: -; DIV64: orq %{{.*}}, [[REG:%[a-z]+]] -; DIV64: testq $-65536, [[REG]] -; DIV64: divw - %div = sdiv i64 %a, %b - ret i64 %div -} - -; Verify that no extra code is generated when optimizing for size. - -define i32 @div32_optsize(i32 %a, i32 %b) optsize { -; DIV32-LABEL: div32_optsize: -; DIV32-NOT: divb - %div = sdiv i32 %a, %b - ret i32 %div -} - -define i32 @div32_minsize(i32 %a, i32 %b) minsize { -; DIV32-LABEL: div32_minsize: -; DIV32-NOT: divb - %div = sdiv i32 %a, %b - ret i32 %div -} - diff --git a/test/CodeGen/X86/slow-unaligned-mem.ll b/test/CodeGen/X86/slow-unaligned-mem.ll index 41e9a95bcdd8..8251eb324a77 100644 --- a/test/CodeGen/X86/slow-unaligned-mem.ll +++ b/test/CodeGen/X86/slow-unaligned-mem.ll @@ -46,6 +46,7 @@ ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver2 2>&1 | FileCheck %s --check-prefix=FAST ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver3 2>&1 | FileCheck %s --check-prefix=FAST ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver4 2>&1 | FileCheck %s --check-prefix=FAST +; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver1 2>&1 | FileCheck %s --check-prefix=FAST ; Other chips with slow unaligned memory accesses diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index 4af9758f122d..972a33f13cd0 100644 --- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -1257,15 +1257,12 @@ define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind { define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind { ; X32-LABEL: test_mm_cvtsi32_sd: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: cvtsi2sdl %eax, %xmm1 -; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X32-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_cvtsi32_sd: ; X64: # BB#0: -; X64-NEXT: cvtsi2sdl %edi, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-NEXT: cvtsi2sdl %edi, %xmm0 ; X64-NEXT: retq %cvt = sitofp i32 %a1 to double %res = insertelement <2 x 
double> %a0, double %cvt, i32 0 @@ -1293,14 +1290,12 @@ define <2 x i64> @test_mm_cvtsi32_si128(i32 %a0) nounwind { define <2 x double> @test_mm_cvtss_sd(<2 x double> %a0, <4 x float> %a1) nounwind { ; X32-LABEL: test_mm_cvtss_sd: ; X32: # BB#0: -; X32-NEXT: cvtss2sd %xmm1, %xmm1 -; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X32-NEXT: cvtss2sd %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_cvtss_sd: ; X64: # BB#0: -; X64-NEXT: cvtss2sd %xmm1, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-NEXT: cvtss2sd %xmm1, %xmm0 ; X64-NEXT: retq %ext = extractelement <4 x float> %a1, i32 0 %cvt = fpext float %ext to double diff --git a/test/CodeGen/X86/vec_ins_extract-1.ll b/test/CodeGen/X86/vec_ins_extract-1.ll index 85c7875d923b..1dc8b7abd207 100644 --- a/test/CodeGen/X86/vec_ins_extract-1.ll +++ b/test/CodeGen/X86/vec_ins_extract-1.ll @@ -12,6 +12,7 @@ define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp +; X32-NEXT: andl $3, %eax ; X32-NEXT: movaps %xmm0, (%esp) ; X32-NEXT: movl $76, (%esp,%eax,4) ; X32-NEXT: movl (%esp), %eax @@ -21,9 +22,10 @@ define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; ; X64-LABEL: t0: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> ; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: movl $76, -24(%rsp,%rax,4) +; X64-NEXT: andl $3, %edi +; X64-NEXT: movl $76, -24(%rsp,%rdi,4) ; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: retq %t13 = insertelement <4 x i32> %t8, i32 76, i32 %t7 @@ -38,6 +40,7 @@ define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp +; X32-NEXT: andl $3, %eax ; X32-NEXT: movl $76, %ecx ; X32-NEXT: pinsrd $0, %ecx, %xmm0 ; X32-NEXT: movdqa %xmm0, (%esp) @@ -48,11 +51,12 @@ define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; ; X64-LABEL: t1: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> ; X64-NEXT: movl $76, %eax ; X64-NEXT: pinsrd $0, %eax, %xmm0 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: movl -24(%rsp,%rax,4), %eax +; X64-NEXT: andl $3, %edi +; X64-NEXT: movl -24(%rsp,%rdi,4), %eax ; X64-NEXT: retq %t13 = insertelement <4 x i32> %t8, i32 76, i32 0 %t9 = extractelement <4 x i32> %t13, i32 %t7 @@ -66,6 +70,7 @@ define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp +; X32-NEXT: andl $3, %eax ; X32-NEXT: movdqa %xmm0, (%esp) ; X32-NEXT: pinsrd $0, (%esp,%eax,4), %xmm0 ; X32-NEXT: movl %ebp, %esp @@ -74,9 +79,10 @@ define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; ; X64-LABEL: t2: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: pinsrd $0, -24(%rsp,%rax,4), %xmm0 +; X64-NEXT: andl $3, %edi +; X64-NEXT: pinsrd $0, -24(%rsp,%rdi,4), %xmm0 ; X64-NEXT: retq %t9 = extractelement <4 x i32> %t8, i32 %t7 %t13 = insertelement <4 x i32> %t8, i32 %t9, i32 0 @@ -90,6 +96,7 @@ define <4 x i32> @t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp +; X32-NEXT: andl $3, %eax ; X32-NEXT: movaps %xmm0, (%esp) ; X32-NEXT: movss %xmm0, (%esp,%eax,4) ; X32-NEXT: movaps (%esp), %xmm0 @@ -99,9 +106,10 @@ define <4 x i32> 
@t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind { ; ; X64-LABEL: t3: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> ; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: movss %xmm0, -24(%rsp,%rax,4) +; X64-NEXT: andl $3, %edi +; X64-NEXT: movss %xmm0, -24(%rsp,%rdi,4) ; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq %t9 = extractelement <4 x i32> %t8, i32 0 diff --git a/test/CodeGen/X86/vec_insert-4.ll b/test/CodeGen/X86/vec_insert-4.ll index c847ac983003..82627c54e663 100644 --- a/test/CodeGen/X86/vec_insert-4.ll +++ b/test/CodeGen/X86/vec_insert-4.ll @@ -10,6 +10,7 @@ define <8 x float> @f(<8 x float> %a, i32 %b) nounwind { ; X32-NEXT: andl $-32, %esp ; X32-NEXT: subl $64, %esp ; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: andl $7, %eax ; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: movaps %xmm0, (%esp) ; X32-NEXT: movl $1084227584, (%esp,%eax,4) ## imm = 0x40A00000 @@ -25,10 +26,11 @@ define <8 x float> @f(<8 x float> %a, i32 %b) nounwind { ; X64-NEXT: movq %rsp, %rbp ; X64-NEXT: andq $-32, %rsp ; X64-NEXT: subq $64, %rsp +; X64-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def> ; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movaps %xmm0, (%rsp) -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: movl $1084227584, (%rsp,%rax,4) ## imm = 0x40A00000 +; X64-NEXT: andl $7, %edi +; X64-NEXT: movl $1084227584, (%rsp,%rdi,4) ## imm = 0x40A00000 ; X64-NEXT: movaps (%rsp), %xmm0 ; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ; X64-NEXT: movq %rbp, %rsp diff --git a/test/CodeGen/X86/vec_insert-8.ll b/test/CodeGen/X86/vec_insert-8.ll index d612e7eb10d3..4074b6d32353 100644 --- a/test/CodeGen/X86/vec_insert-8.ll +++ b/test/CodeGen/X86/vec_insert-8.ll @@ -11,10 +11,11 @@ define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind { ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp -; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: movl 12(%ebp), %ecx +; X32-NEXT: movl 12(%ebp), %eax +; X32-NEXT: andl $3, %eax +; X32-NEXT: movl 8(%ebp), %ecx ; X32-NEXT: movaps %xmm0, (%esp) -; X32-NEXT: movl %eax, (%esp,%ecx,4) +; X32-NEXT: movl %ecx, (%esp,%eax,4) ; X32-NEXT: movaps (%esp), %xmm0 ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -22,9 +23,10 @@ define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind { ; ; X64-LABEL: var_insert: ; X64: # BB#0: # %entry +; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movslq %esi, %rax -; X64-NEXT: movl %edi, -24(%rsp,%rax,4) +; X64-NEXT: andl $3, %esi +; X64-NEXT: movl %edi, -24(%rsp,%rsi,4) ; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-NEXT: retq entry: @@ -40,6 +42,7 @@ define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind { ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp ; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: andl $3, %eax ; X32-NEXT: movaps %xmm0, (%esp) ; X32-NEXT: movl (%esp,%eax,4), %eax ; X32-NEXT: movl %ebp, %esp @@ -48,9 +51,10 @@ define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind { ; ; X64-LABEL: var_extract: ; X64: # BB#0: # %entry +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> ; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: movl -24(%rsp,%rax,4), %eax +; X64-NEXT: andl $3, %edi +; X64-NEXT: movl -24(%rsp,%rdi,4), %eax ; X64-NEXT: retq entry: %tmp3 = extractelement <4 x i32> %x, i32 %idx diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index 6a81cdc490fe..923af1216d05 
100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -4818,3 +4818,63 @@ define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) { store <8 x float> %4, <8 x float>* %3, align 32 ret void } + +define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind { +; SSE-LABEL: sitofp_i32_to_2f64: +; SSE: # BB#0: +; SSE-NEXT: cvtsi2sdl %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i32_to_2f64: +; AVX: # BB#0: +; AVX-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i32 %a1 to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind { +; SSE-LABEL: sitofp_i32_to_4f32: +; SSE: # BB#0: +; SSE-NEXT: cvtsi2ssl %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i32_to_4f32: +; AVX: # BB#0: +; AVX-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i32 %a1 to float + %res = insertelement <4 x float> %a0, float %cvt, i32 0 + ret <4 x float> %res +} + +define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind { +; SSE-LABEL: sitofp_i64_to_2f64: +; SSE: # BB#0: +; SSE-NEXT: cvtsi2sdq %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i64_to_2f64: +; AVX: # BB#0: +; AVX-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i64 %a1 to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind { +; SSE-LABEL: sitofp_i64_to_4f32: +; SSE: # BB#0: +; SSE-NEXT: cvtsi2ssq %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i64_to_4f32: +; AVX: # BB#0: +; AVX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i64 %a1 to float + %res = insertelement <4 x float> %a0, float %cvt, i32 0 + ret <4 x float> %res +} diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll index 440faa689fb8..9f0d4a7d7264 100644 --- a/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -7,7 +7,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW - +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL ; ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
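The two new RUN prefixes exist because AVX512VL exposes EVEX forms of the shift instructions at 128- and 256-bit width; in particular, a v2i64 arithmetic shift, which otherwise needs a shift-plus-sign-fixup sequence, collapses to a single vpsraq/vpsravq. A standalone sketch of the new coverage (hypothetical function name; RUN flags copied from the prefixes added above):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s
define <2 x i64> @ashr_v2i64_by7(<2 x i64> %a) nounwind {
; CHECK-LABEL: ashr_v2i64_by7:
; One EVEX instruction replaces the pre-VLX emulation sequence.
; CHECK: vpsraq $7, %xmm0, %xmm0
; CHECK-NEXT: retq
  %r = ashr <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %r
}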
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 @@ -89,6 +90,11 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-NEXT: vpsubq %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: var_shift_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: var_shift_v2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] @@ -193,6 +199,11 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: var_shift_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: var_shift_v4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movdqa %xmm1, %xmm2 @@ -339,6 +350,19 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512BW-NEXT: retq ; +; AVX512DQVL-LABEL: var_shift_v8i16: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v8i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; ; X32-SSE-LABEL: var_shift_v8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: psllw $12, %xmm1 @@ -515,6 +539,14 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: var_shift_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: var_shift_v16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] @@ -624,6 +656,11 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatvar_shift_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsraq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatvar_shift_v2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] @@ -669,6 +706,12 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatvar_shift_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatvar_shift_v4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: 
xorps %xmm2, %xmm2 @@ -712,6 +755,12 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatvar_shift_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatvar_shift_v8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pextrw $0, %xmm1, %eax @@ -907,6 +956,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatvar_shift_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatvar_shift_v16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -1033,6 +1091,11 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: constant_shift_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: constant_shift_v2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648] @@ -1114,6 +1177,11 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: constant_shift_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: constant_shift_v4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1207,6 +1275,18 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512BW-NEXT: retq ; +; AVX512DQVL-LABEL: constant_shift_v8i16: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v8i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; ; X32-SSE-LABEL: constant_shift_v8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1367,6 +1447,13 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: constant_shift_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: constant_shift_v16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] @@ -1480,6 
+1567,11 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatconstant_shift_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsraq $7, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatconstant_shift_v2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1514,6 +1606,11 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { ; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatconstant_shift_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsrad $5, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatconstant_shift_v4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: psrad $5, %xmm0 @@ -1543,6 +1640,11 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { ; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatconstant_shift_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatconstant_shift_v8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: psraw $3, %xmm0 @@ -1586,6 +1688,15 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatconstant_shift_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatconstant_shift_v16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: psrlw $3, %xmm0 diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index 79902acfec24..aee2857157b6 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -5,6 +5,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL + ; ; Variable Shifts ; @@ -74,6 +77,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsubq %ymm3, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: var_shift_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = ashr <4 x i64> %a, %b ret <4 x i64> %shift } @@ -135,6 +143,11 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: var_shift_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = 
ashr <8 x i32> %a, %b ret <8 x i32> %shift } @@ -228,6 +241,19 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v16i16: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v16i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: retq %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -375,6 +401,42 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v32i8: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4 +; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4 +; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4 +; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3 +; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3 +; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -435,6 +497,11 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vpsubq %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = ashr <4 x i64> %a, %splat ret <4 x i64> %shift @@ -476,6 +543,12 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer %shift = ashr <8 x i32> %a, %splat ret <8 x i32> %shift @@ -517,6 +590,12 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v16i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = ashr <16 x i16> %a, %splat ret <16 x i16> %shift @@ -662,6 +741,44 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v32i8: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4 +; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4 +; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4 +; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; 
AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3 +; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3 +; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = ashr <32 x i8> %a, %splat ret <32 x i8> %shift @@ -724,6 +841,11 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: constant_shift_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> ret <4 x i64> %shift } @@ -769,6 +891,11 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: constant_shift_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> ret <8 x i32> %shift } @@ -844,6 +971,18 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v16i16: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v16i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: retq %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> ret <16 x i16> %shift } @@ -981,6 +1120,42 @@ define <32 x i8> 
@constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v32i8: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4 +; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4 +; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4 +; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3 +; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3 +; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <32 x i8> %shift } @@ -1033,6 +1208,11 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsraq $7, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> ret <4 x i64> %shift } @@ -1068,6 +1248,11 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $5, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsrad $5, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = ashr <8 x i32> %a, <i32 5, 
i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> ret <8 x i32> %shift } @@ -1103,6 +1288,11 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsraw $3, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v16i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsraw $3, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> ret <16 x i16> %shift } @@ -1160,6 +1350,15 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll index 2c9e433cfb2c..6cc98b5f3eeb 100644 --- a/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW + ; ; Variable Shifts ; @@ -99,399 +100,36 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: sarb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: sarb %cl, %dl -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm4 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: sarb %cl, %al -; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: sarb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: sarb %cl, %dl -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: 
vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
[... the same removed vpextrb/sarb %cl/movzbl/vpinsrb pattern repeats for every remaining byte of lanes 3, 2, 1 and 0, each lane extracted with vextracti32x4 and the results reassembled with vinserti128/vinserti64x4; several hundred removed lines in all ...]
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
 %shift = ashr <64 x i8> %a, %b
 ret <64 x i8> %shift
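The new AVX512BW sequence above replaces sixty-four scalar sarb's with three masked shift-and-blend rounds: vpsllw $5 moves each 3-bit shift amount to the top of its control byte, vpmovb2m turns the control MSBs into a mask register, and vmovdqu8 merges in a 4-, 2- and then 1-bit vpsraw result while vpaddw doubles the controls between rounds. A minimal scalar sketch of what one byte lane computes (the helper name is ours, not LLVM's, and it assumes the compiler's signed right shift is arithmetic, as on all mainstream targets):

    #include <stdint.h>

    /* Scalar model of the staged shift, not LLVM's own code. */
    static int8_t ashr_byte_staged(int8_t v, uint8_t amt) {
        uint8_t ctl = (uint8_t)(amt << 5); /* vpsllw $5 on the amounts      */
        if (ctl & 0x80) v >>= 4;           /* vpsraw $4 + vpmovb2m/vmovdqu8 */
        ctl <<= 1;                         /* vpaddw: next control bit up   */
        if (ctl & 0x80) v >>= 2;           /* vpsraw $2 round               */
        ctl <<= 1;
        if (ctl & 0x80) v >>= 1;           /* vpsraw $1 round               */
        return v;                          /* equals v >> (amt & 7)         */
    }

Since 4 + 2 + 1 covers every amount from 0 to 7, three data-independent rounds handle all 64 lanes at once.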
@@ -590,399 +228,36 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512BW-LABEL: splatvar_shift_v64i8:
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
[... the same removed per-byte vpextrb/sarb/movzbl/vpinsrb expansion as in var_shift_v64i8 repeats for all 64 bytes across the four lanes; several hundred removed lines in all ...]
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
 %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
 %shift = ashr <64 x i8> %a, %splat
 ret <64 x i8> %shift
@@ -1080,252 +355,36 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v64i8:
 ; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
[... the same removed vpextrb / sarb $n / movzbl / vpinsrb pattern repeats with the immediates 0-7,7-0 for every byte of all four lanes, reassembled with vinserti128/vinserti64x4; roughly 250 removed lines in all ...]
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT: vpsraw $4, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsllw $5, {{.*}}(%rip), %zmm3
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm3[8],zmm0[9],zmm3[9],zmm0[10],zmm3[10],zmm0[11],zmm3[11],zmm0[12],zmm3[12],zmm0[13],zmm3[13],zmm0[14],zmm3[14],zmm0[15],zmm3[15],zmm0[24],zmm3[24],zmm0[25],zmm3[25],zmm0[26],zmm3[26],zmm0[27],zmm3[27],zmm0[28],zmm3[28],zmm0[29],zmm3[29],zmm0[30],zmm3[30],zmm0[31],zmm3[31],zmm0[40],zmm3[40],zmm0[41],zmm3[41],zmm0[42],zmm3[42],zmm0[43],zmm3[43],zmm0[44],zmm3[44],zmm0[45],zmm3[45],zmm0[46],zmm3[46],zmm0[47],zmm3[47],zmm0[56],zmm3[56],zmm0[57],zmm3[57],zmm0[58],zmm3[58],zmm0[59],zmm3[59],zmm0[60],zmm3[60],zmm0[61],zmm3[61],zmm0[62],zmm3[62],zmm0[63],zmm3[63]
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm1, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm1, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm3[0],zmm0[1],zmm3[1],zmm0[2],zmm3[2],zmm0[3],zmm3[3],zmm0[4],zmm3[4],zmm0[5],zmm3[5],zmm0[6],zmm3[6],zmm0[7],zmm3[7],zmm0[16],zmm3[16],zmm0[17],zmm3[17],zmm0[18],zmm3[18],zmm0[19],zmm3[19],zmm0[20],zmm3[20],zmm0[21],zmm3[21],zmm0[22],zmm3[22],zmm0[23],zmm3[23],zmm0[32],zmm3[32],zmm0[33],zmm3[33],zmm0[34],zmm3[34],zmm0[35],zmm3[35],zmm0[36],zmm3[36],zmm0[37],zmm3[37],zmm0[38],zmm3[38],zmm0[39],zmm3[39],zmm0[48],zmm3[48],zmm0[49],zmm3[49],zmm0[50],zmm3[50],zmm0[51],zmm3[51],zmm0[52],zmm3[52],zmm0[53],zmm3[53],zmm0[54],zmm3[54],zmm0[55],zmm3[55]
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm3, %zmm3
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm3, %zmm3
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
 %shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
 ret <64 x i8> %shift
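In the constant case the staged controls come from a constant pool (the vpsllw $5 source is a {{.*}}(%rip) load) but the rounds are otherwise the same. The next file covers the logical-shift counterparts, where the new 128-bit AVX512VL paths instead lean on per-lane variable dword shifts: bytes are zero-extended with vpmovzxbd, shifted with vpsrlvd and truncated back with vpmovdb. A per-element sketch (hypothetical helper name; vpsrlvd zeroes any lane whose count is 32 or more, hence the explicit range check):

    #include <stdint.h>

    /* Model of the zero-extend / variable-shift / truncate pattern. */
    static uint8_t lshr_byte_via_dword(uint8_t b, uint32_t amt) {
        uint32_t w = b;                 /* vpmovzxbd: byte -> dword lane */
        if (amt > 31) return 0;         /* vpsrlvd: oversize count => 0  */
        return (uint8_t)(w >> amt);     /* vpsrlvd, then vpmovdb         */
    }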
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index a7e1a531b659..9b8c0def4558 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -7,6 +7,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
 ;
 ; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
@@ -65,6 +67,11 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: var_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: var_shift_v2i64:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
@@ -162,6 +169,11 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: var_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: var_shift_v4i32:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
@@ -308,6 +320,19 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
 ; AVX512BW-NEXT: retq
 ;
+; AVX512DQVL-LABEL: var_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
 ; X32-SSE-LABEL: var_shift_v8i16:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: psllw $12, %xmm1
@@ -433,6 +458,14 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: var_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: var_shift_v16i8:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: psllw $5, %xmm1
@@ -492,6 +525,11 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: splatvar_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: splatvar_shift_v2i64:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: psrlq %xmm1, %xmm0
@@ -533,6 +571,12 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: splatvar_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: splatvar_shift_v4i32:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: xorps %xmm2, %xmm2
@@ -576,6 +620,12 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: splatvar_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: splatvar_shift_v8i16:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: pextrw $0, %xmm1, %eax
@@ -709,6 +759,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: splatvar_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: splatvar_shift_v16i8:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -798,6 +857,11 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
 ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: constant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: constant_shift_v2i64:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -872,6 +936,11 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
 ; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: constant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: constant_shift_v4i32:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -965,6 +1034,18 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
 ; AVX512BW-NEXT: retq
 ;
+; AVX512DQVL-LABEL: constant_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
 ; X32-SSE-LABEL: constant_shift_v8i16:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1071,6 +1152,13 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: constant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: constant_shift_v16i8:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
@@ -1131,6 +1219,11 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
 ; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: splatconstant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: splatconstant_shift_v2i64:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: psrlq $7, %xmm0
@@ -1160,6 +1253,11 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
 ; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: splatconstant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrld $5, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: splatconstant_shift_v4i32:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: psrld $5, %xmm0
@@ -1189,6 +1287,11 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
 ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: splatconstant_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: splatconstant_shift_v8i16:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: psrlw $3, %xmm0
@@ -1223,6 +1326,12 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
+; AVX512VL-LABEL: splatconstant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
 ; X32-SSE-LABEL: splatconstant_shift_v16i8:
 ; X32-SSE: # BB#0:
 ; X32-SSE-NEXT: psrlw $3, %xmm0
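In the 256-bit file that follows, the AVX512DQVL v32i8 path keeps the data in byte lanes: each vpsrlw round shifts two byte lanes as one word, so the bits that leak across the byte boundary are cleared with a vpand against a constant the test only matches as {{.*}}(%rip). A splatted 0xFF >> n is the natural candidate, though the test does not pin it down; a two-byte sketch of one round under that assumption (helper name ours):

    #include <stdint.h>

    /* Model of one vpsrlw + vpand round on two byte lanes. */
    static void lshr_two_bytes_via_word(uint8_t b[2], unsigned n) {
        uint16_t w = (uint16_t)((b[1] << 8) | b[0]);         /* two lanes, one word */
        w = (uint16_t)(w >> n);                              /* vpsrlw $n           */
        w &= (uint16_t)(((0xFFu >> n) << 8) | (0xFFu >> n)); /* vpand: drop leaks   */
        b[0] = (uint8_t)w;
        b[1] = (uint8_t)(w >> 8);
    }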
diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll
index 25667e7d1661..58bb8f3e6ec0 100644
--- a/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -5,6 +5,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+
 ;
 ; Variable Shifts
 ;
@@ -51,6 +54,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
 %shift = lshr <4 x i64> %a, %b
 ret <4 x i64> %shift
 }
@@ -112,6 +120,11 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
 %shift = lshr <8 x i32> %a, %b
 ret <8 x i32> %shift
 }
@@ -205,6 +218,19 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
 %shift = lshr <16 x i16> %a, %b
 ret <16 x i16> %shift
 }
@@ -307,6 +333,30 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
 %shift = lshr <32 x i8> %a, %b
 ret <32 x i8> %shift
 }
@@ -346,6 +396,11 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
 %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
 %shift = lshr <4 x i64> %a, %splat
 ret <4 x i64> %shift
@@ -387,6 +442,12 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
 %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
 %shift = lshr <8 x i32> %a, %splat
 ret <8 x i32> %shift
@@ -428,6 +489,12 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
 %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
 %shift = lshr <16 x i16> %a, %splat
 ret <16 x i16> %shift
@@ -532,6 +599,32 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT:
vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = lshr <32 x i8> %a, %splat ret <32 x i8> %shift @@ -579,6 +672,11 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: constant_shift_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> ret <4 x i64> %shift } @@ -624,6 +722,11 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: constant_shift_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> ret <8 x i32> %shift } @@ -699,6 +802,18 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v16i16: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v16i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: retq %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> ret <16 x i16> %shift } @@ -795,6 +910,30 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v32i8: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vmovdqa {{.*#+}} 
ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <32 x i8> %shift } @@ -834,6 +973,11 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> ret <4 x i64> %shift } @@ -869,6 +1013,11 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsrld $5, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsrld $5, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> ret <8 x i32> %shift } @@ -904,6 +1053,11 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v16i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> ret <16 x i16> %shift } @@ -947,6 +1101,12 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %shift } diff --git 
a/test/CodeGen/X86/vector-shift-lshr-512.ll b/test/CodeGen/X86/vector-shift-lshr-512.ll index 3da8f9437e57..905445f30162 100644 --- a/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -79,399 +79,21 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm4 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx -; AVX512BW-NEXT: 
vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm5 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: 
vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %sil -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm5 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %sil, %eax -; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax 
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %sil -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: movzbl %sil, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, 
%al -; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), 
%zmm2, %zmm2 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -553,399 +175,21 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-LABEL: splatvar_shift_v64i8: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm4 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: 
vpextrb $12, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm5 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; 
AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi -; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %sil -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm5 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %sil, %eax -; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, 
%xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %sil -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: movzbl %sil, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> 
%CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %dl -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shrb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $1, 
%zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: retq %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer %shift = lshr <64 x i8> %a, %splat @@ -1026,252 +270,21 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm2 -; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax -; AVX512BW-NEXT: shrb %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax -; AVX512BW-NEXT: shrb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax -; AVX512BW-NEXT: shrb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax -; AVX512BW-NEXT: shrb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax -; AVX512BW-NEXT: shrb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax -; AVX512BW-NEXT: shrb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax -; AVX512BW-NEXT: shrb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax -; AVX512BW-NEXT: shrb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax -; AVX512BW-NEXT: shrb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax -; AVX512BW-NEXT: shrb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax -; AVX512BW-NEXT: shrb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax -; AVX512BW-NEXT: shrb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax -; AVX512BW-NEXT: shrb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax -; AVX512BW-NEXT: shrb %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax -; AVX512BW-NEXT: shrb %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax -; AVX512BW-NEXT: shrb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax -; AVX512BW-NEXT: shrb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax -; 
AVX512BW-NEXT: shrb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax -; AVX512BW-NEXT: shrb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax -; AVX512BW-NEXT: shrb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax -; AVX512BW-NEXT: shrb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax -; AVX512BW-NEXT: shrb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax -; AVX512BW-NEXT: shrb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax -; AVX512BW-NEXT: shrb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax -; AVX512BW-NEXT: shrb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax -; AVX512BW-NEXT: shrb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax -; AVX512BW-NEXT: shrb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax -; AVX512BW-NEXT: shrb %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax -; AVX512BW-NEXT: shrb %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax -; AVX512BW-NEXT: shrb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax -; AVX512BW-NEXT: shrb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax -; AVX512BW-NEXT: shrb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax -; AVX512BW-NEXT: shrb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax -; AVX512BW-NEXT: shrb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax -; AVX512BW-NEXT: shrb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax -; AVX512BW-NEXT: shrb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax -; AVX512BW-NEXT: shrb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax -; AVX512BW-NEXT: shrb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: 
vpextrb $11, %xmm2, %eax -; AVX512BW-NEXT: shrb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax -; AVX512BW-NEXT: shrb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax -; AVX512BW-NEXT: shrb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax -; AVX512BW-NEXT: shrb %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax -; AVX512BW-NEXT: shrb %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax -; AVX512BW-NEXT: shrb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: shrb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: shrb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax -; AVX512BW-NEXT: shrb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax -; AVX512BW-NEXT: shrb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: shrb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: shrb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax -; AVX512BW-NEXT: shrb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax -; AVX512BW-NEXT: shrb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: shrb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: shrb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax -; AVX512BW-NEXT: shrb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax -; AVX512BW-NEXT: shrb %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw $5, {{.*}}(%rip), %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT: retq
 %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
 ret <64 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 8706078b40c9..32334420f8b2 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -7,6 +7,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
 ;
 ; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 @@ -63,6 +65,11 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: var_shift_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: var_shift_v2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] @@ -128,6 +135,11 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: var_shift_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: var_shift_v4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pslld $23, %xmm1 @@ -263,6 +275,19 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512BW-NEXT: retq ; +; AVX512DQVL-LABEL: var_shift_v8i16: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v8i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; ; X32-SSE-LABEL: var_shift_v8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: psllw $12, %xmm1 @@ -383,6 +408,14 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: var_shift_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: var_shift_v16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: psllw $5, %xmm1 @@ -441,6 +474,11 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatvar_shift_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatvar_shift_v2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: psllq %xmm1, %xmm0 @@ -482,6 +520,12 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512-NEXT: vpslld %xmm1, %xmm0, 
%xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatvar_shift_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatvar_shift_v4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: xorps %xmm2, %xmm2 @@ -525,6 +569,12 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatvar_shift_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatvar_shift_v8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pextrw $0, %xmm1, %eax @@ -651,6 +701,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatvar_shift_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatvar_shift_v16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -737,6 +796,11 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: constant_shift_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: constant_shift_v2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -792,6 +856,11 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: constant_shift_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: constant_shift_v4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] @@ -836,6 +905,16 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512BW-NEXT: retq ; +; AVX512DQVL-LABEL: constant_shift_v8i16: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v8i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq +; ; X32-SSE-LABEL: constant_shift_v8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 @@ -925,6 +1004,13 @@ define 
<16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: constant_shift_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: constant_shift_v16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] @@ -984,6 +1070,11 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { ; AVX512-NEXT: vpsllq $7, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatconstant_shift_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllq $7, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatconstant_shift_v2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: psllq $7, %xmm0 @@ -1013,6 +1104,11 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { ; AVX512-NEXT: vpslld $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatconstant_shift_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatconstant_shift_v4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pslld $5, %xmm0 @@ -1042,6 +1138,11 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatconstant_shift_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatconstant_shift_v8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: psllw $3, %xmm0 @@ -1074,6 +1175,12 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; +; AVX512VL-LABEL: splatconstant_shift_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; ; X32-SSE-LABEL: splatconstant_shift_v16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: psllw $3, %xmm0 diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll index a1ef2791c1b0..104fa089c744 100644 --- a/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/test/CodeGen/X86/vector-shift-shl-256.ll @@ -5,6 +5,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL ; ; Variable Shifts @@ -49,6 +51,11 @@ define <4 x i64> @var_shift_v4i64(<4 x 
i64> %a, <4 x i64> %b) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: var_shift_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = shl <4 x i64> %a, %b ret <4 x i64> %shift } @@ -93,6 +100,11 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: var_shift_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = shl <8 x i32> %a, %b ret <8 x i32> %shift } @@ -180,6 +192,19 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v16i16: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v16i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: retq %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift } @@ -271,6 +296,29 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v32i8: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq %shift = shl <32 x i8> %a, %b ret <32 x i8> %shift } @@ -310,6 +358,11 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = shl <4 x i64> %a, %splat ret <4 x i64> %shift @@ -351,6 +404,12 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer %shift = shl <8 x i32> %a, %splat ret <8 x i32> %shift @@ -392,6 +451,12 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatvar_shift_v16i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = shl <16 x i16> %a, %splat ret <16 x i16> %shift @@ -487,6 +552,31 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v32i8: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = shl <32 x i8> %a, %splat ret <32 x i8> %shift @@ -531,6 +621,11 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: constant_shift_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = shl <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> ret <4 x i64> %shift } @@ -566,6 +661,11 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: constant_shift_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> ret <8 x i32> %shift } @@ -609,6 +709,16 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v16i16: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v16i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: retq %shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> ret <16 x i16> %shift } @@ -698,6 +808,29 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v32i8: +; AVX512DQVL: # BB#0: +; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; 
AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq %shift = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <32 x i8> %shift } @@ -737,6 +870,11 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsllq $7, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> ret <4 x i64> %shift } @@ -772,6 +910,11 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpslld $5, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpslld $5, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> ret <8 x i32> %shift } @@ -807,6 +950,11 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v16i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> ret <16 x i16> %shift } @@ -849,6 +997,12 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_shift_v32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shift-shl-512.ll b/test/CodeGen/X86/vector-shift-shl-512.ll index b9c9b56427f1..180d6f3a3b03 100644 --- a/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/test/CodeGen/X86/vector-shift-shl-512.ll @@ -76,399 +76,19 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: 
movzbl %al, %eax -; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm4 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: 
shlb %cl, %al -; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx -; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm5 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx -; AVX512BW-NEXT: # kill: 
%CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %sil -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm5 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %sil, %eax -; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax -; 
AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %sil -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: movzbl %sil, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx -; 
AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} ; AVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -547,399 +167,19 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-LABEL: splatvar_shift_v64i8: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; 
AVX512BW-NEXT: vpextrb $0, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm4 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax -; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; 
AVX512BW-NEXT: vpextrb $15, %xmm2, %edx -; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm5 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> 
%ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi -; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %sil -; AVX512BW-NEXT: movzbl %dl, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm5 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %sil, %eax -; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> 
%CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %sil -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: movzbl %sil, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $7, %xmm1, 
%ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %dl -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: movzbl %dl, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> -; AVX512BW-NEXT: shlb %cl, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} ; AVX512BW-NEXT: retq %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer %shift = shl <64 x i8> %a, %splat @@ -1013,252 +253,19 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm2 -; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax -; AVX512BW-NEXT: addb %al, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax -; 
AVX512BW-NEXT: shlb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax -; AVX512BW-NEXT: shlb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax -; AVX512BW-NEXT: shlb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax -; AVX512BW-NEXT: shlb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax -; AVX512BW-NEXT: shlb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax -; AVX512BW-NEXT: shlb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax -; AVX512BW-NEXT: shlb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax -; AVX512BW-NEXT: shlb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax -; AVX512BW-NEXT: shlb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax -; AVX512BW-NEXT: shlb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax -; AVX512BW-NEXT: shlb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax -; AVX512BW-NEXT: shlb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax -; AVX512BW-NEXT: addb %al, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax -; AVX512BW-NEXT: addb %al, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax -; AVX512BW-NEXT: shlb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax -; AVX512BW-NEXT: shlb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax -; AVX512BW-NEXT: shlb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax -; AVX512BW-NEXT: shlb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax -; AVX512BW-NEXT: shlb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax -; AVX512BW-NEXT: shlb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax -; AVX512BW-NEXT: shlb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax -; AVX512BW-NEXT: shlb $6, 
%al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax -; AVX512BW-NEXT: shlb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax -; AVX512BW-NEXT: shlb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax -; AVX512BW-NEXT: shlb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax -; AVX512BW-NEXT: shlb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax -; AVX512BW-NEXT: addb %al, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm3 -; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax -; AVX512BW-NEXT: addb %al, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax -; AVX512BW-NEXT: shlb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax -; AVX512BW-NEXT: shlb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax -; AVX512BW-NEXT: shlb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax -; AVX512BW-NEXT: shlb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax -; AVX512BW-NEXT: shlb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax -; AVX512BW-NEXT: shlb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax -; AVX512BW-NEXT: shlb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax -; AVX512BW-NEXT: shlb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax -; AVX512BW-NEXT: shlb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax -; AVX512BW-NEXT: shlb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax -; AVX512BW-NEXT: shlb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax -; AVX512BW-NEXT: shlb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax -; AVX512BW-NEXT: addb %al, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm3 -; 
AVX512BW-NEXT: vpextrb $1, %xmm0, %eax -; AVX512BW-NEXT: addb %al, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax -; AVX512BW-NEXT: shlb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: shlb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: shlb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax -; AVX512BW-NEXT: shlb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax -; AVX512BW-NEXT: shlb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: shlb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: shlb $7, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax -; AVX512BW-NEXT: shlb $6, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax -; AVX512BW-NEXT: shlb $5, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: shlb $4, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: shlb $3, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax -; AVX512BW-NEXT: shlb $2, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax -; AVX512BW-NEXT: addb %al, %al -; AVX512BW-NEXT: movzbl %al, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw $5, {{.*}}(%rip), %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} ; AVX512BW-NEXT: retq %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <64 x i8> %shift diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll new 
file mode 100644 index 000000000000..defc3e918b24 --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=skx | FileCheck %s --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=knl | FileCheck %s --check-prefix=KNL + +;expand 128 -> 256 include <4 x float> <2 x double> +define <8 x float> @expand(<4 x float> %a) { +; SKX-LABEL: expand: +; SKX: # BB#0: +; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; SKX-NEXT: movb $5, %al +; SKX-NEXT: kmovb %eax, %k1 +; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq +; +; KNL-LABEL: expand: +; KNL: # BB#0: +; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7] +; KNL-NEXT: retq + %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 5, i32 1, i32 5, i32 5, i32 5, i32 5, i32 5> + ret <8 x float> %res +} + +define <8 x float> @expand1(<4 x float> %a ) { +; SKX-LABEL: expand1: +; SKX: # BB#0: +; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; SKX-NEXT: movb $-86, %al +; SKX-NEXT: kmovb %eax, %k1 +; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq +; +; KNL-LABEL: expand1: +; KNL: # BB#0: +; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <u,0,u,1,u,2,u,3> +; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; KNL-NEXT: retq + %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> + ret <8 x float> %res +} + +;Expand 128 -> 256 test <2 x double> -> <4 x double> +define <4 x double> @expand2(<2 x double> %a) { +; SKX-LABEL: expand2: +; SKX: # BB#0: +; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; SKX-NEXT: movb $9, %al +; SKX-NEXT: kmovb %eax, %k1 +; SKX-NEXT: vexpandpd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq +; +; KNL-LABEL: expand2: +; KNL: # BB#0: +; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] +; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] +; KNL-NEXT: retq + %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 2, i32 1> + ret <4 x double> %res +} + +;expand 128 -> 256 include case <4 x i32> <8 x i32> +define <8 x i32> @expand3(<4 x i32> %a ) { +; SKX-LABEL: expand3: +; SKX: # BB#0: +; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; SKX-NEXT: movb $-127, %al +; SKX-NEXT: kmovb %eax, %k1 +; SKX-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq +; +; KNL-LABEL: expand3: +; KNL: # BB#0: +; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; KNL-NEXT: vpbroadcastq %xmm0, %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7] +; KNL-NEXT: retq + %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> <i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,i32 5> + ret <8 x i32> %res +} + +;expand 128 -> 256 include case <2 x i64> <4 x i64> +define <4 x i64> @expand4(<2 x i64> %a ) { +; SKX-LABEL: expand4: +; SKX: # BB#0: +; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; SKX-NEXT: movb $9, %al +; 
SKX-NEXT: kmovb %eax, %k1 +; SKX-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq +; +; KNL-LABEL: expand4: +; KNL: # BB#0: +; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; KNL-NEXT: retq + %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> <i32 2, i32 0, i32 0, i32 3> + ret <4 x i64> %res +} + +;Negative test for 128-> 256 +define <8 x float> @expand5(<4 x float> %a ) { +; SKX-LABEL: expand5: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastss %xmm0, %ymm0 +; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; SKX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; SKX-NEXT: retq +; +; KNL-LABEL: expand5: +; KNL: # BB#0: +; KNL-NEXT: vbroadcastss %xmm0, %ymm0 +; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; KNL-NEXT: retq + %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4> + ret <8 x float> %res +} + +;expand 256 -> 512 include <8 x float> <16 x float> +define <8 x float> @expand6(<4 x float> %a ) { +; SKX-LABEL: expand6: +; SKX: # BB#0: +; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vinsertf{{.*}}$1, %xmm0, %ymm1, %ymm0 +; SKX-NEXT: retq +; +; KNL-LABEL: expand6: +; KNL: # BB#0: +; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; KNL-NEXT: retq + %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <16 x float> @expand7(<8 x float> %a) { +; SKX-LABEL: expand7: +; SKX: # BB#0: +; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; SKX-NEXT: movw $1285, %ax # imm = 0x505 +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq +; +; KNL-LABEL: expand7: +; KNL: # BB#0: +; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; KNL-NEXT: movw $1285, %ax # imm = 0x505 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq + %res = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 8, i32 8, i32 8, i32 8, i32 2, i32 8, i32 3, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x float> %res +} + +define <16 x float> @expand8(<8 x float> %a ) { +; SKX-LABEL: expand8: +; SKX: # BB#0: +; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq +; +; KNL-LABEL: expand8: +; KNL: # BB#0: +; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq + %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + ret <16 x float> %res +} + +;expand 256 -> 512 include <4 x double> <8 x double> +define <8 x double> @expand9(<4 x double> %a) { +; SKX-LABEL: expand9: +; SKX: # BB#0: +; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; SKX-NEXT: movb $-127, %al +; SKX-NEXT: kmovb %eax, %k1 +; SKX-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z} 
+; SKX-NEXT: retq +; +; KNL-LABEL: expand9: +; KNL: # BB#0: +; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; KNL-NEXT: movb $-127, %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq + %res = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1> + ret <8 x double> %res +} + +define <16 x i32> @expand10(<8 x i32> %a ) { +; SKX-LABEL: expand10: +; SKX: # BB#0: +; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq +; +; KNL-LABEL: expand10: +; KNL: # BB#0: +; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq + %res = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + ret <16 x i32> %res +} + +define <8 x i64> @expand11(<4 x i64> %a) { +; SKX-LABEL: expand11: +; SKX: # BB#0: +; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; SKX-NEXT: movb $-127, %al +; SKX-NEXT: kmovb %eax, %k1 +; SKX-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq +; +; KNL-LABEL: expand11: +; KNL: # BB#0: +; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; KNL-NEXT: movb $-127, %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq + %res = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1> + ret <8 x i64> %res +} + +;Negative test for 256-> 512 +define <16 x float> @expand12(<8 x float> %a) { +; SKX-LABEL: expand12: +; SKX: # BB#0: +; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16] +; SKX-NEXT: vxorps %zmm1, %zmm1, %zmm1 +; SKX-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq +; +; KNL-LABEL: expand12: +; KNL: # BB#0: +; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; KNL-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16] +; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; KNL-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 +; KNL-NEXT: vmovaps %zmm1, %zmm0 +; KNL-NEXT: retq + %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8,i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8> + ret <16 x float> %res +} + +define <16 x float> @expand13(<8 x float> %a ) { +; SKX-LABEL: expand13: +; SKX: # BB#0: +; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; SKX-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0 +; SKX-NEXT: retq +; +; KNL-LABEL: expand13: +; KNL: # BB#0: +; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: retq + %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +; The function checks the case where the vector is a mixed-values vector and the mask points at zero elements from this vector.
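(An illustrative aside on the pattern these checks encode, not part of the commit: the zero-masked vexpandps/vpexpandd sequences that SKX selects in this file correspond directly to the AVX-512 expand intrinsics. Below is a minimal C sketch, assuming a compiler with AVX512F and AVX512VL enabled; the function name and the 0x05 mask are invented for illustration and mirror the @expand case at the top of this file. The @expand14 test that follows exercises the same lowering when the source vector mixes computed and zero lanes.)

#include <immintrin.h>

/* Zero-masked expand: consecutive low lanes of the source are written to the
   lane positions whose mask bit is set; all other lanes are zeroed. With mask
   0x05 (0b101), a[0] lands in lane 0 and a[1] in lane 2, which is exactly the
   <i32 0, i32 5, i32 1, i32 5, ...> shuffle checked in @expand above. */
static __m256 expand_sketch(__m128 a) {
    __m256 wide = _mm256_castps128_ps256(a);  /* upper 128 bits undefined */
    return _mm256_maskz_expand_ps((__mmask8)0x05, wide);  /* vexpandps {z} */
}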
+ +define <8 x float> @expand14(<4 x float> %a) { +; SKX-LABEL: expand14: +; SKX: # BB#0: +; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def> +; SKX-NEXT: movb $20, %al +; SKX-NEXT: kmovb %eax, %k1 +; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq +; +; KNL-LABEL: expand14: +; KNL: # BB#0: +; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] +; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> +; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0] +; KNL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] +; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] +; KNL-NEXT: retq + %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0> + %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 3, i32 3, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0> + ret <8 x float> %res +} + +;Negative test. +define <8 x float> @expand15(<4 x float> %a) { +; SKX-LABEL: expand15: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SKX-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u> +; SKX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0] +; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3] +; SKX-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0 +; SKX-NEXT: retq +; +; KNL-LABEL: expand15: +; KNL: # BB#0: +; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] +; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> +; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0] +; KNL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] +; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] +; KNL-NEXT: retq + %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0> + %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0> + ret <8 x float> %res +} diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll index 23e40a6572af..b79df1facfa1 100644 --- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll +++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll @@ -91,6 +91,20 @@ define <4 x float> @combine_vpermil2ps_1z74(<4 x float> %a0, <4 x float> %a1) { ret <4 x float> %res1 } +define <4 x float> @combine_vpermil2ps_02zu(<4 x float> %a0, <4 x float> %a1) { +; X32-LABEL: combine_vpermil2ps_02zu: +; X32: # BB#0: +; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; X32-NEXT: retl +; +; X64-LABEL: combine_vpermil2ps_02zu: +; X64: # BB#0: +; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; X64-NEXT: retq + %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 undef>, i8 0) + ret <4 x float> %res0 +} + define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) { ; X32-LABEL: combine_vpermil2ps256_identity: ; X32: # BB#0: diff --git a/test/CodeGen/X86/vector-shuffle-variable-128.ll b/test/CodeGen/X86/vector-shuffle-variable-128.ll index d130e7ff00b2..70b7fb16fc25 100644 --- a/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -12,6 +12,8 @@ define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i64 %i1) nounwind { ; SSE-LABEL: var_shuffle_v2f64_v2f64_xx_i64: ; SSE: # BB#0: +; SSE-NEXT: andl $1, %esi +; SSE-NEXT: andl $1, %edi ; SSE-NEXT: 
movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -19,6 +21,8 @@ define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i6 ; ; AVX-LABEL: var_shuffle_v2f64_v2f64_xx_i64: ; AVX: # BB#0: +; AVX-NEXT: andl $1, %esi +; AVX-NEXT: andl $1, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -33,9 +37,11 @@ define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i6 define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) nounwind { ; SSE-LABEL: var_shuffle_v2i64_v2i64_xx_i64: ; SSE: # BB#0: -; SSE-NEXT: movslq %edi, %rax +; SSE-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SSE-NEXT: andl $1, %edi ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movslq %esi, %rcx +; SSE-NEXT: andl $1, %esi ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -43,9 +49,11 @@ define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) ; ; AVX-LABEL: var_shuffle_v2i64_v2i64_xx_i64: ; AVX: # BB#0: -; AVX-NEXT: movslq %edi, %rax +; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; AVX-NEXT: andl $1, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movslq %esi, %rcx +; AVX-NEXT: andl $1, %esi ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -60,11 +68,15 @@ define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind { ; SSE2-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: ; SSE2: # BB#0: -; SSE2-NEXT: movslq %edi, %rax -; SSE2-NEXT: movslq %esi, %rsi -; SSE2-NEXT: movslq %edx, %rdx +; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SSE2-NEXT: andl $3, %edi +; SSE2-NEXT: andl $3, %esi +; SSE2-NEXT: andl $3, %edx ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movslq %ecx, %rcx +; SSE2-NEXT: andl $3, %ecx ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -76,11 +88,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3 ; ; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: ; SSSE3: # BB#0: -; SSSE3-NEXT: movslq %edi, %rax -; SSSE3-NEXT: movslq %esi, %rsi -; SSSE3-NEXT: movslq %edx, %rdx +; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SSSE3-NEXT: andl $3, %edi +; SSSE3-NEXT: andl $3, %esi +; SSSE3-NEXT: andl $3, %edx ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movslq %ecx, %rcx +; SSSE3-NEXT: andl $3, %ecx ; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -92,11 +108,15 @@ define <4 
x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3 ; ; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: ; SSE41: # BB#0: -; SSE41-NEXT: movslq %edi, %rax -; SSE41-NEXT: movslq %esi, %rsi -; SSE41-NEXT: movslq %edx, %rdx +; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SSE41-NEXT: andl $3, %edi +; SSE41-NEXT: andl $3, %esi +; SSE41-NEXT: andl $3, %edx ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movslq %ecx, %rcx +; SSE41-NEXT: andl $3, %ecx ; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -105,11 +125,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3 ; ; AVX-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: ; AVX: # BB#0: -; AVX-NEXT: movslq %edi, %rax -; AVX-NEXT: movslq %esi, %rsi -; AVX-NEXT: movslq %edx, %rdx +; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; AVX-NEXT: andl $3, %edi +; AVX-NEXT: andl $3, %esi +; AVX-NEXT: andl $3, %edx ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movslq %ecx, %rcx +; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -129,11 +153,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3 define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind { ; SSE2-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32: ; SSE2: # BB#0: -; SSE2-NEXT: movslq %edi, %rax -; SSE2-NEXT: movslq %esi, %rsi -; SSE2-NEXT: movslq %edx, %rdx +; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SSE2-NEXT: andl $3, %edi +; SSE2-NEXT: andl $3, %esi +; SSE2-NEXT: andl $3, %edx ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movslq %ecx, %rcx +; SSE2-NEXT: andl $3, %ecx ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -145,11 +173,15 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i ; ; SSSE3-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32: ; SSSE3: # BB#0: -; SSSE3-NEXT: movslq %edi, %rax -; SSSE3-NEXT: movslq %esi, %rsi -; SSSE3-NEXT: movslq %edx, %rdx +; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SSSE3-NEXT: andl $3, %edi +; SSSE3-NEXT: andl $3, %esi +; SSSE3-NEXT: andl $3, %edx ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movslq %ecx, %rcx +; SSSE3-NEXT: andl $3, %ecx ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -161,11 +193,15 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, 
i32 %i0, i32 %i ; ; SSE41-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32: ; SSE41: # BB#0: -; SSE41-NEXT: movslq %edi, %rax -; SSE41-NEXT: movslq %esi, %rsi -; SSE41-NEXT: movslq %edx, %rdx +; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SSE41-NEXT: andl $3, %edi +; SSE41-NEXT: andl $3, %esi +; SSE41-NEXT: andl $3, %edx ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movslq %ecx, %rcx +; SSE41-NEXT: andl $3, %ecx ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, -24(%rsp,%rsi,4), %xmm0 ; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0 @@ -174,11 +210,15 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i ; ; AVX-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32: ; AVX: # BB#0: -; AVX-NEXT: movslq %edi, %rax -; AVX-NEXT: movslq %esi, %rsi -; AVX-NEXT: movslq %edx, %rdx +; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; AVX-NEXT: andl $3, %edi +; AVX-NEXT: andl $3, %esi +; AVX-NEXT: andl $3, %edx ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movslq %ecx, %rcx +; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 @@ -204,34 +244,36 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> ; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SSE2-NEXT: movswq %di, %rax -; SSE2-NEXT: movswq %si, %rsi -; SSE2-NEXT: movswq %dx, %rdx -; SSE2-NEXT: movswq %cx, %r10 -; SSE2-NEXT: movswq %r8w, %r11 +; SSE2-NEXT: andl $7, %edi +; SSE2-NEXT: andl $7, %esi +; SSE2-NEXT: andl $7, %edx +; SSE2-NEXT: andl $7, %ecx +; SSE2-NEXT: andl $7, %r8d ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movswq %r9w, %r8 -; SSE2-NEXT: movswq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movswq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx -; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %edi +; SSE2-NEXT: andl $7, %r9d +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: andl $7, %r10d +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $7, %eax +; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %r10d ; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %edi ; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %esi -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movd %r10d, %xmm0 +; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %edx +; SSE2-NEXT: movd %edx, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %ecx -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzwl -24(%rsp,%r11,2), %eax -; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %edx +; SSE2-NEXT: movd %edx, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; 
SSE2-NEXT: movd %edi, %xmm1 +; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: movd %esi, %xmm1 -; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax +; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -246,34 +288,36 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> ; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SSSE3-NEXT: movswq %di, %rax -; SSSE3-NEXT: movswq %si, %rsi -; SSSE3-NEXT: movswq %dx, %rdx -; SSSE3-NEXT: movswq %cx, %r10 -; SSSE3-NEXT: movswq %r8w, %r11 +; SSSE3-NEXT: andl $7, %edi +; SSSE3-NEXT: andl $7, %esi +; SSSE3-NEXT: andl $7, %edx +; SSSE3-NEXT: andl $7, %ecx +; SSSE3-NEXT: andl $7, %r8d ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movswq %r9w, %r8 -; SSSE3-NEXT: movswq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: movswq {{[0-9]+}}(%rsp), %rdi -; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx -; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi +; SSSE3-NEXT: andl $7, %r9d +; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; SSSE3-NEXT: andl $7, %r10d +; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $7, %eax +; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d ; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi ; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movd %r10d, %xmm0 +; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx +; SSSE3-NEXT: movd %edx, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %ecx -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzwl -24(%rsp,%r11,2), %eax -; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %edx +; SSSE3-NEXT: movd %edx, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: movd %edi, %xmm1 +; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSSE3-NEXT: movd %esi, %xmm1 -; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax +; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -282,68 +326,66 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; ; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: ; SSE41: # BB#0: -; SSE41-NEXT: pushq %rbx ; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> ; SSE41-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> ; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> ; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> ; SSE41-NEXT: # 
kill: %ESI<def> %ESI<kill> %RSI<def> ; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SSE41-NEXT: movswq %di, %rax -; SSE41-NEXT: movswq %si, %rbx -; SSE41-NEXT: movswq %dx, %r11 -; SSE41-NEXT: movswq %cx, %r10 -; SSE41-NEXT: movswq %r8w, %rdi +; SSE41-NEXT: andl $7, %edi +; SSE41-NEXT: andl $7, %esi +; SSE41-NEXT: andl $7, %edx +; SSE41-NEXT: andl $7, %ecx +; SSE41-NEXT: andl $7, %r8d ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movswq %r9w, %rcx -; SSE41-NEXT: movswq {{[0-9]+}}(%rsp), %rdx -; SSE41-NEXT: movswq {{[0-9]+}}(%rsp), %rsi -; SSE41-NEXT: movzwl -16(%rsp,%rdx,2), %edx -; SSE41-NEXT: movzwl -16(%rsp,%rsi,2), %esi -; SSE41-NEXT: movzwl -16(%rsp,%rax,2), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pinsrw $1, -16(%rsp,%rbx,2), %xmm0 -; SSE41-NEXT: pinsrw $2, -16(%rsp,%r11,2), %xmm0 -; SSE41-NEXT: pinsrw $3, -16(%rsp,%r10,2), %xmm0 -; SSE41-NEXT: pinsrw $4, -16(%rsp,%rdi,2), %xmm0 -; SSE41-NEXT: pinsrw $5, -16(%rsp,%rcx,2), %xmm0 -; SSE41-NEXT: pinsrw $6, %edx, %xmm0 -; SSE41-NEXT: pinsrw $7, %esi, %xmm0 -; SSE41-NEXT: popq %rbx +; SSE41-NEXT: andl $7, %r9d +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; SSE41-NEXT: andl $7, %r10d +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: andl $7, %eax +; SSE41-NEXT: movzwl -24(%rsp,%r10,2), %r10d +; SSE41-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSE41-NEXT: movzwl -24(%rsp,%rdi,2), %edi +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pinsrw $1, -24(%rsp,%rsi,2), %xmm0 +; SSE41-NEXT: pinsrw $2, -24(%rsp,%rdx,2), %xmm0 +; SSE41-NEXT: pinsrw $3, -24(%rsp,%rcx,2), %xmm0 +; SSE41-NEXT: pinsrw $4, -24(%rsp,%r8,2), %xmm0 +; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm0 +; SSE41-NEXT: pinsrw $6, %r10d, %xmm0 +; SSE41-NEXT: pinsrw $7, %eax, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: ; AVX: # BB#0: -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %rbx ; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> ; AVX-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> ; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> ; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> ; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; AVX-NEXT: movswq %di, %r10 -; AVX-NEXT: movswq %si, %r11 -; AVX-NEXT: movswq %dx, %r14 -; AVX-NEXT: movswq %cx, %rcx -; AVX-NEXT: movswq %r8w, %rdi +; AVX-NEXT: andl $7, %edi +; AVX-NEXT: andl $7, %esi +; AVX-NEXT: andl $7, %edx +; AVX-NEXT: andl $7, %ecx +; AVX-NEXT: andl $7, %r8d ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movswq %r9w, %rax -; AVX-NEXT: movswq {{[0-9]+}}(%rsp), %rsi -; AVX-NEXT: movswq {{[0-9]+}}(%rsp), %rdx -; AVX-NEXT: movzwl -24(%rsp,%rsi,2), %esi -; AVX-NEXT: movzwl -24(%rsp,%rdx,2), %edx -; AVX-NEXT: movzwl -24(%rsp,%r10,2), %ebx -; AVX-NEXT: vmovd %ebx, %xmm0 -; AVX-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $2, -24(%rsp,%r14,2), %xmm0, %xmm0 +; AVX-NEXT: andl $7, %r9d +; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; AVX-NEXT: andl $7, %r10d +; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX-NEXT: andl $7, %eax +; AVX-NEXT: movzwl -24(%rsp,%r10,2), %r10d +; AVX-NEXT: movzwl -24(%rsp,%rax,2), %eax +; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %edi +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0 ; AVX-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $4, -24(%rsp,%rdi,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $6, %esi, 
%xmm0, %xmm0 -; AVX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0 -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r14 +; AVX-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq %x0 = extractelement <8 x i16> %x, i16 %i0 %x1 = extractelement <8 x i16> %x, i16 %i1 @@ -374,54 +416,64 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: andl $15, %r10d ; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %r11 ; SSE2-NEXT: movzbl (%r10,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm15 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm9 -; SSE2-NEXT: movsbq %dl, %rax -; SSE2-NEXT: movzbl (%rax,%r11), %eax +; SSE2-NEXT: andl $15, %edx +; SSE2-NEXT: movzbl (%rdx,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm10 -; SSE2-NEXT: movsbq %dil, %rax -; SSE2-NEXT: movzbl (%rax,%r11), %eax +; SSE2-NEXT: andl $15, %edi +; SSE2-NEXT: movzbl (%rdi,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm11 -; SSE2-NEXT: movsbq %r8b, %rax -; SSE2-NEXT: movzbl (%rax,%r11), %eax +; SSE2-NEXT: andl $15, %r8d +; SSE2-NEXT: movzbl (%r8,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm7 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm12 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm13 -; SSE2-NEXT: movsbq %cl, %rax -; SSE2-NEXT: movzbl (%rax,%r11), %eax +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl (%rcx,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm14 -; SSE2-NEXT: movsbq %sil, %rax -; SSE2-NEXT: movzbl (%rax,%r11), %eax +; SSE2-NEXT: andl $15, %esi +; SSE2-NEXT: movzbl (%rsi,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm5 -; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movsbq %r9b, %rax -; SSE2-NEXT: movzbl (%rax,%r11), %eax +; SSE2-NEXT: andl $15, %r9d +; SSE2-NEXT: movzbl (%r9,%r11), %eax ; SSE2-NEXT: movd 
%eax, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] @@ -449,54 +501,64 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; SSSE3-NEXT: andl $15, %r10d ; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %r11 ; SSSE3-NEXT: movzbl (%r10,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm15 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm8 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm9 -; SSSE3-NEXT: movsbq %dl, %rax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax +; SSSE3-NEXT: andl $15, %edx +; SSSE3-NEXT: movzbl (%rdx,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm10 -; SSSE3-NEXT: movsbq %dil, %rax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax +; SSSE3-NEXT: andl $15, %edi +; SSSE3-NEXT: movzbl (%rdi,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm11 -; SSSE3-NEXT: movsbq %r8b, %rax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax +; SSSE3-NEXT: andl $15, %r8d +; SSSE3-NEXT: movzbl (%r8,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm7 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm12 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm13 -; SSSE3-NEXT: movsbq %cl, %rax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl (%rcx,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm6 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm14 -; SSSE3-NEXT: movsbq %sil, %rax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax +; SSSE3-NEXT: andl $15, %esi +; SSSE3-NEXT: movzbl (%rsi,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm5 -; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: movsbq %r9b, %rax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax +; SSSE3-NEXT: andl $15, %r9d +; SSSE3-NEXT: movzbl 
(%r9,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] @@ -520,7 +582,6 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE41-NEXT: pushq %rbp ; SSE41-NEXT: pushq %r15 ; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx ; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> @@ -529,54 +590,63 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> ; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SSE41-NEXT: movsbq %dil, %r15 -; SSE41-NEXT: movsbq %sil, %r14 -; SSE41-NEXT: movsbq %dl, %r11 -; SSE41-NEXT: movsbq %cl, %r10 -; SSE41-NEXT: movsbq %r8b, %r8 +; SSE41-NEXT: andl $15, %edi +; SSE41-NEXT: andl $15, %esi +; SSE41-NEXT: andl $15, %edx +; SSE41-NEXT: andl $15, %ecx +; SSE41-NEXT: andl $15, %r8d ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movsbq %r9b, %r9 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r12 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r13 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rbp -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rbx +; SSE41-NEXT: andl $15, %r9d +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; SSE41-NEXT: andl $15, %r10d +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; SSE41-NEXT: andl $15, %r11d +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; SSE41-NEXT: andl $15, %r14d +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; SSE41-NEXT: andl $15, %r15d ; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; SSE41-NEXT: movzbl (%r15,%rax), %ecx -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r15 -; SSE41-NEXT: pinsrb $1, (%r14,%rax), %xmm0 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r14 -; SSE41-NEXT: pinsrb $2, (%r11,%rax), %xmm0 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r11 -; SSE41-NEXT: pinsrb $3, (%r10,%rax), %xmm0 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: movzbl (%rdi,%rax), %edi +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; SSE41-NEXT: andl $15, %r12d +; SSE41-NEXT: pinsrb $1, (%rsi,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; SSE41-NEXT: andl $15, %esi +; SSE41-NEXT: pinsrb $2, (%rdx,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; SSE41-NEXT: andl $15, %edx +; SSE41-NEXT: pinsrb $3, (%rcx,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: andl $15, %ecx ; SSE41-NEXT: pinsrb $4, (%r8,%rax), %xmm0 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; SSE41-NEXT: andl $15, %ebx ; SSE41-NEXT: pinsrb $5, (%r9,%rax), %xmm0 -; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rdx -; SSE41-NEXT: movzbl (%r12,%rax), %esi -; SSE41-NEXT: movzbl (%r13,%rax), %edi -; SSE41-NEXT: movzbl (%rbp,%rax), %ebp -; SSE41-NEXT: movzbl (%rbx,%rax), %ebx -; SSE41-NEXT: movzbl (%r15,%rax), %r8d -; SSE41-NEXT: movzbl (%r14,%rax), %r9d -; SSE41-NEXT: movzbl (%r11,%rax), %r11d -; SSE41-NEXT: movzbl (%r10,%rax), %r10d +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; SSE41-NEXT: andl $15, %edi +; SSE41-NEXT: movzbl (%r10,%rax), %r8d +; SSE41-NEXT: movzbl (%r11,%rax), %r9d +; SSE41-NEXT: 
movzbl (%r14,%rax), %r10d +; SSE41-NEXT: movzbl (%r15,%rax), %r11d +; SSE41-NEXT: movzbl (%r12,%rax), %ebp +; SSE41-NEXT: movzbl (%rsi,%rax), %esi +; SSE41-NEXT: movzbl (%rdx,%rax), %edx ; SSE41-NEXT: movzbl (%rcx,%rax), %ecx -; SSE41-NEXT: movzbl (%rdx,%rax), %eax -; SSE41-NEXT: pinsrb $6, %esi, %xmm0 -; SSE41-NEXT: pinsrb $7, %edi, %xmm0 -; SSE41-NEXT: pinsrb $8, %ebp, %xmm0 -; SSE41-NEXT: pinsrb $9, %ebx, %xmm0 -; SSE41-NEXT: pinsrb $10, %r8d, %xmm0 -; SSE41-NEXT: pinsrb $11, %r9d, %xmm0 -; SSE41-NEXT: pinsrb $12, %r11d, %xmm0 -; SSE41-NEXT: pinsrb $13, %r10d, %xmm0 -; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 +; SSE41-NEXT: movzbl (%rbx,%rax), %ebx +; SSE41-NEXT: movzbl (%rdi,%rax), %eax +; SSE41-NEXT: pinsrb $6, %r8d, %xmm0 +; SSE41-NEXT: pinsrb $7, %r9d, %xmm0 +; SSE41-NEXT: pinsrb $8, %r10d, %xmm0 +; SSE41-NEXT: pinsrb $9, %r11d, %xmm0 +; SSE41-NEXT: pinsrb $10, %ebp, %xmm0 +; SSE41-NEXT: pinsrb $11, %esi, %xmm0 +; SSE41-NEXT: pinsrb $12, %edx, %xmm0 +; SSE41-NEXT: pinsrb $13, %ecx, %xmm0 +; SSE41-NEXT: pinsrb $14, %ebx, %xmm0 ; SSE41-NEXT: pinsrb $15, %eax, %xmm0 ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 ; SSE41-NEXT: popq %r14 ; SSE41-NEXT: popq %r15 ; SSE41-NEXT: popq %rbp @@ -587,7 +657,6 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; AVX-NEXT: pushq %rbp ; AVX-NEXT: pushq %r15 ; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %r13 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx ; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> @@ -596,54 +665,63 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> ; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; AVX-NEXT: movsbq %dil, %r10 -; AVX-NEXT: movsbq %sil, %r11 -; AVX-NEXT: movsbq %dl, %r14 -; AVX-NEXT: movsbq %cl, %r15 -; AVX-NEXT: movsbq %r8b, %r8 +; AVX-NEXT: andl $15, %edi +; AVX-NEXT: andl $15, %esi +; AVX-NEXT: andl $15, %edx +; AVX-NEXT: andl $15, %ecx +; AVX-NEXT: andl $15, %r8d ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movsbq %r9b, %r9 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r12 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r13 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rbp -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rdi -; AVX-NEXT: movzbl (%r10,%rdi), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: vpinsrb $1, (%r11,%rdi), %xmm0, %xmm0 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r11 -; AVX-NEXT: vpinsrb $2, (%r14,%rdi), %xmm0, %xmm0 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r14 -; AVX-NEXT: vpinsrb $3, (%r15,%rdi), %xmm0, %xmm0 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r15 -; AVX-NEXT: vpinsrb $4, (%r8,%rdi), %xmm0, %xmm0 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: vpinsrb $5, (%r9,%rdi), %xmm0, %xmm0 -; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rsi -; AVX-NEXT: movzbl (%r12,%rdi), %edx -; AVX-NEXT: movzbl (%r13,%rdi), %ebx -; AVX-NEXT: movzbl (%rbp,%rdi), %ebp -; AVX-NEXT: movzbl (%rcx,%rdi), %ecx -; AVX-NEXT: movzbl (%r10,%rdi), %eax -; AVX-NEXT: movzbl (%r11,%rdi), %r9d -; AVX-NEXT: movzbl (%r14,%rdi), %r10d -; AVX-NEXT: movzbl (%r15,%rdi), %r11d -; AVX-NEXT: movzbl (%r8,%rdi), %r8d -; AVX-NEXT: movzbl (%rsi,%rdi), %esi -; AVX-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $7, %ebx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX-NEXT: 
vpinsrb $11, %r9d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $15, %esi, %xmm0, %xmm0 +; AVX-NEXT: andl $15, %r9d +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; AVX-NEXT: andl $15, %r10d +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; AVX-NEXT: andl $15, %r11d +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; AVX-NEXT: andl $15, %r14d +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; AVX-NEXT: andl $15, %r15d +; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movzbl (%rdi,%rax), %edi +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; AVX-NEXT: andl $15, %r12d +; AVX-NEXT: vpinsrb $1, (%rsi,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; AVX-NEXT: andl $15, %esi +; AVX-NEXT: vpinsrb $2, (%rdx,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; AVX-NEXT: andl $15, %edx +; AVX-NEXT: vpinsrb $3, (%rcx,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: andl $15, %ecx +; AVX-NEXT: vpinsrb $4, (%r8,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; AVX-NEXT: andl $15, %ebx +; AVX-NEXT: vpinsrb $5, (%r9,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; AVX-NEXT: andl $15, %edi +; AVX-NEXT: movzbl (%r10,%rax), %r8d +; AVX-NEXT: movzbl (%r11,%rax), %r9d +; AVX-NEXT: movzbl (%r14,%rax), %r10d +; AVX-NEXT: movzbl (%r15,%rax), %r11d +; AVX-NEXT: movzbl (%r12,%rax), %ebp +; AVX-NEXT: movzbl (%rsi,%rax), %esi +; AVX-NEXT: movzbl (%rdx,%rax), %edx +; AVX-NEXT: movzbl (%rcx,%rax), %ecx +; AVX-NEXT: movzbl (%rbx,%rax), %ebx +; AVX-NEXT: movzbl (%rdi,%rax), %eax +; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $7, %r9d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $14, %ebx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 -; AVX-NEXT: popq %r13 ; AVX-NEXT: popq %r14 ; AVX-NEXT: popq %r15 ; AVX-NEXT: popq %rbp @@ -690,11 +768,15 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwind { ; SSE2-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: ; SSE2: # BB#0: -; SSE2-NEXT: movslq (%rdi), %rax +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movl 4(%rdi), %ecx +; SSE2-NEXT: andl $3, %eax ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movslq 4(%rdi), %rcx -; SSE2-NEXT: movslq 8(%rdi), %rdx -; SSE2-NEXT: movslq 12(%rdi), %rsi +; SSE2-NEXT: andl $3, %ecx +; SSE2-NEXT: movl 8(%rdi), %edx +; SSE2-NEXT: andl $3, %edx +; SSE2-NEXT: movl 12(%rdi), %esi +; SSE2-NEXT: andl $3, %esi ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -706,11 +788,15 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi ; ; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: ; SSSE3: # BB#0: -; SSSE3-NEXT: movslq (%rdi), %rax +; SSSE3-NEXT: movl (%rdi), %eax +; SSSE3-NEXT: movl 4(%rdi), %ecx +; SSSE3-NEXT: andl $3, %eax ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movslq 4(%rdi), %rcx -; SSSE3-NEXT: movslq 8(%rdi), 
%rdx -; SSSE3-NEXT: movslq 12(%rdi), %rsi +; SSSE3-NEXT: andl $3, %ecx +; SSSE3-NEXT: movl 8(%rdi), %edx +; SSSE3-NEXT: andl $3, %edx +; SSSE3-NEXT: movl 12(%rdi), %esi +; SSSE3-NEXT: andl $3, %esi ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -722,11 +808,15 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi ; ; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: ; SSE41: # BB#0: -; SSE41-NEXT: movslq (%rdi), %rax +; SSE41-NEXT: movl (%rdi), %eax +; SSE41-NEXT: movl 4(%rdi), %ecx +; SSE41-NEXT: andl $3, %eax ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movslq 4(%rdi), %rcx -; SSE41-NEXT: movslq 8(%rdi), %rdx -; SSE41-NEXT: movslq 12(%rdi), %rsi +; SSE41-NEXT: andl $3, %ecx +; SSE41-NEXT: movl 8(%rdi), %edx +; SSE41-NEXT: andl $3, %edx +; SSE41-NEXT: movl 12(%rdi), %esi +; SSE41-NEXT: andl $3, %esi ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, -24(%rsp,%rcx,4), %xmm0 ; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0 @@ -735,11 +825,15 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi ; ; AVX-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: ; AVX: # BB#0: -; AVX-NEXT: movslq (%rdi), %rax +; AVX-NEXT: movl (%rdi), %eax +; AVX-NEXT: movl 4(%rdi), %ecx +; AVX-NEXT: andl $3, %eax ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movslq 4(%rdi), %rcx -; AVX-NEXT: movslq 8(%rdi), %rdx -; AVX-NEXT: movslq 12(%rdi), %rsi +; AVX-NEXT: andl $3, %ecx +; AVX-NEXT: movl 8(%rdi), %edx +; AVX-NEXT: andl $3, %edx +; AVX-NEXT: movl 12(%rdi), %esi +; AVX-NEXT: andl $3, %esi ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 @@ -767,55 +861,71 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* %i) nounwind { ; SSE2-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: ; SSE2: # BB#0: -; SSE2-NEXT: movsbq (%rdi), %rcx +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movsbq 8(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm8 -; SSE2-NEXT: movsbq 12(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm9 -; SSE2-NEXT: movsbq 4(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: movsbq 14(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm10 -; SSE2-NEXT: movsbq 6(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm5 -; SSE2-NEXT: movsbq 10(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm11 -; SSE2-NEXT: movsbq 2(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm7 -; SSE2-NEXT: movsbq 15(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm12 -; SSE2-NEXT: movsbq 7(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: movsbq 11(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm13 -; SSE2-NEXT: movsbq 3(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; 
SSE2-NEXT: movd %ecx, %xmm6 -; SSE2-NEXT: movsbq 13(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm14 -; SSE2-NEXT: movsbq 5(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: movsbq 9(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %ecx -; SSE2-NEXT: movd %ecx, %xmm15 -; SSE2-NEXT: movsbq 1(%rdi), %rcx -; SSE2-NEXT: movzbl (%rcx,%rax), %eax +; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl 8(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: movzbl 12(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: movzbl 4(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movzbl 14(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm10 +; SSE2-NEXT: movzbl 6(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: movzbl 10(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm11 +; SSE2-NEXT: movzbl 2(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm7 +; SSE2-NEXT: movzbl 15(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm12 +; SSE2-NEXT: movzbl 7(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movzbl 11(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm13 +; SSE2-NEXT: movzbl 3(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movzbl 13(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: movzbl 5(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl 9(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm15 +; SSE2-NEXT: movzbl 1(%rdi), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%rcx), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] @@ -836,55 +946,71 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* ; ; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: ; SSSE3: # BB#0: -; SSSE3-NEXT: movsbq (%rdi), %rcx +; SSSE3-NEXT: movzbl (%rdi), %eax +; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movsbq 8(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm8 -; SSSE3-NEXT: movsbq 12(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm9 -; 
SSSE3-NEXT: movsbq 4(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: movsbq 14(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm10 -; SSSE3-NEXT: movsbq 6(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm5 -; SSSE3-NEXT: movsbq 10(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm11 -; SSSE3-NEXT: movsbq 2(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm7 -; SSSE3-NEXT: movsbq 15(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm12 -; SSSE3-NEXT: movsbq 7(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: movsbq 11(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm13 -; SSSE3-NEXT: movsbq 3(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm6 -; SSSE3-NEXT: movsbq 13(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm14 -; SSSE3-NEXT: movsbq 5(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: movsbq 9(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm15 -; SSSE3-NEXT: movsbq 1(%rdi), %rcx -; SSSE3-NEXT: movzbl (%rcx,%rax), %eax +; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl 8(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm8 +; SSSE3-NEXT: movzbl 12(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm9 +; SSSE3-NEXT: movzbl 4(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movzbl 14(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm10 +; SSSE3-NEXT: movzbl 6(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: movzbl 10(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: movzbl 2(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm7 +; SSSE3-NEXT: movzbl 15(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movzbl 7(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: movzbl 11(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm13 +; SSSE3-NEXT: movzbl 3(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movzbl 13(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm14 +; SSSE3-NEXT: movzbl 5(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl 9(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm15 +; SSSE3-NEXT: movzbl 1(%rdi), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%rcx), %eax ; 
SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] @@ -911,55 +1037,75 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* ; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movsbq (%rdi), %rax +; SSE41-NEXT: movzbl (%rdi), %r11d +; SSE41-NEXT: andl $15, %r11d ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movsbq 1(%rdi), %r15 -; SSE41-NEXT: movsbq 2(%rdi), %r8 -; SSE41-NEXT: movsbq 3(%rdi), %r9 -; SSE41-NEXT: movsbq 4(%rdi), %r10 -; SSE41-NEXT: movsbq 5(%rdi), %r11 -; SSE41-NEXT: movsbq 6(%rdi), %r14 -; SSE41-NEXT: movsbq 7(%rdi), %r12 -; SSE41-NEXT: movsbq 8(%rdi), %r13 -; SSE41-NEXT: movsbq 9(%rdi), %rdx -; SSE41-NEXT: movsbq 10(%rdi), %rcx -; SSE41-NEXT: movsbq 11(%rdi), %rsi -; SSE41-NEXT: movsbq 12(%rdi), %rbx +; SSE41-NEXT: movzbl 1(%rdi), %r9d +; SSE41-NEXT: andl $15, %r9d +; SSE41-NEXT: movzbl 2(%rdi), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; SSE41-NEXT: movzbl 3(%rdi), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; SSE41-NEXT: movzbl 4(%rdi), %r14d +; SSE41-NEXT: andl $15, %r14d +; SSE41-NEXT: movzbl 5(%rdi), %r15d +; SSE41-NEXT: andl $15, %r15d +; SSE41-NEXT: movzbl 6(%rdi), %r12d +; SSE41-NEXT: andl $15, %r12d +; SSE41-NEXT: movzbl 7(%rdi), %r13d +; SSE41-NEXT: andl $15, %r13d +; SSE41-NEXT: movzbl 8(%rdi), %r8d +; SSE41-NEXT: andl $15, %r8d +; SSE41-NEXT: movzbl 9(%rdi), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: movzbl 10(%rdi), %ecx +; SSE41-NEXT: andl $15, %ecx +; SSE41-NEXT: movzbl 11(%rdi), %edx +; SSE41-NEXT: andl $15, %edx +; SSE41-NEXT: movzbl 12(%rdi), %esi +; SSE41-NEXT: andl $15, %esi ; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp -; SSE41-NEXT: movzbl (%rax,%rbp), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: movsbq 13(%rdi), %rax -; SSE41-NEXT: pinsrb $1, (%r15,%rbp), %xmm0 -; SSE41-NEXT: movsbq 14(%rdi), %r15 -; SSE41-NEXT: movsbq 15(%rdi), %rdi -; SSE41-NEXT: movzbl (%rdi,%rbp), %edi -; SSE41-NEXT: movzbl (%r15,%rbp), %r15d -; SSE41-NEXT: movzbl (%rax,%rbp), %eax -; SSE41-NEXT: movzbl (%rbx,%rbp), %ebx +; SSE41-NEXT: movzbl (%r11,%rbp), %ebx +; SSE41-NEXT: movd %ebx, %xmm0 +; SSE41-NEXT: movzbl 13(%rdi), %r11d +; SSE41-NEXT: andl $15, %r11d +; SSE41-NEXT: pinsrb $1, (%r9,%rbp), %xmm0 +; SSE41-NEXT: movzbl 14(%rdi), %ebx +; SSE41-NEXT: andl $15, %ebx +; SSE41-NEXT: movzbl 15(%rdi), %edi +; SSE41-NEXT: andl $15, %edi +; SSE41-NEXT: movzbl (%rdi,%rbp), %r10d +; SSE41-NEXT: movzbl (%rbx,%rbp), %r9d +; SSE41-NEXT: movzbl (%r11,%rbp), %r11d ; SSE41-NEXT: movzbl (%rsi,%rbp), %esi -; SSE41-NEXT: movzbl (%rcx,%rbp), %ecx ; SSE41-NEXT: movzbl (%rdx,%rbp), %edx +; SSE41-NEXT: movzbl (%rcx,%rbp), %ecx +; SSE41-NEXT: movzbl (%rax,%rbp), %eax +; SSE41-NEXT: movzbl (%r8,%rbp), %r8d ; SSE41-NEXT: movzbl (%r13,%rbp), %r13d ; SSE41-NEXT: movzbl (%r12,%rbp), %r12d +; SSE41-NEXT: movzbl (%r15,%rbp), %r15d ; SSE41-NEXT: movzbl (%r14,%rbp), %r14d -; SSE41-NEXT: movzbl (%r11,%rbp), %r11d -; SSE41-NEXT: movzbl (%r10,%rbp), %r10d -; SSE41-NEXT: movzbl (%r9,%rbp), %r9d -; SSE41-NEXT: movzbl (%r8,%rbp), %ebp +; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload +; SSE41-NEXT: 
movzbl (%rdi,%rbp), %edi +; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload +; SSE41-NEXT: movzbl (%rbx,%rbp), %ebp ; SSE41-NEXT: pinsrb $2, %ebp, %xmm0 -; SSE41-NEXT: pinsrb $3, %r9d, %xmm0 -; SSE41-NEXT: pinsrb $4, %r10d, %xmm0 -; SSE41-NEXT: pinsrb $5, %r11d, %xmm0 -; SSE41-NEXT: pinsrb $6, %r14d, %xmm0 -; SSE41-NEXT: pinsrb $7, %r12d, %xmm0 -; SSE41-NEXT: pinsrb $8, %r13d, %xmm0 -; SSE41-NEXT: pinsrb $9, %edx, %xmm0 +; SSE41-NEXT: pinsrb $3, %edi, %xmm0 +; SSE41-NEXT: pinsrb $4, %r14d, %xmm0 +; SSE41-NEXT: pinsrb $5, %r15d, %xmm0 +; SSE41-NEXT: pinsrb $6, %r12d, %xmm0 +; SSE41-NEXT: pinsrb $7, %r13d, %xmm0 +; SSE41-NEXT: pinsrb $8, %r8d, %xmm0 +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 ; SSE41-NEXT: pinsrb $10, %ecx, %xmm0 -; SSE41-NEXT: pinsrb $11, %esi, %xmm0 -; SSE41-NEXT: pinsrb $12, %ebx, %xmm0 -; SSE41-NEXT: pinsrb $13, %eax, %xmm0 -; SSE41-NEXT: pinsrb $14, %r15d, %xmm0 -; SSE41-NEXT: pinsrb $15, %edi, %xmm0 +; SSE41-NEXT: pinsrb $11, %edx, %xmm0 +; SSE41-NEXT: pinsrb $12, %esi, %xmm0 +; SSE41-NEXT: pinsrb $13, %r11d, %xmm0 +; SSE41-NEXT: pinsrb $14, %r9d, %xmm0 +; SSE41-NEXT: pinsrb $15, %r10d, %xmm0 ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r13 @@ -976,55 +1122,75 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* ; AVX-NEXT: pushq %r13 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movsbq (%rdi), %rsi +; AVX-NEXT: movzbl (%rdi), %r11d +; AVX-NEXT: andl $15, %r11d ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movsbq 1(%rdi), %r15 -; AVX-NEXT: movsbq 2(%rdi), %r8 -; AVX-NEXT: movsbq 3(%rdi), %r9 -; AVX-NEXT: movsbq 4(%rdi), %r10 -; AVX-NEXT: movsbq 5(%rdi), %r11 -; AVX-NEXT: movsbq 6(%rdi), %r14 -; AVX-NEXT: movsbq 7(%rdi), %r12 -; AVX-NEXT: movsbq 8(%rdi), %r13 -; AVX-NEXT: movsbq 9(%rdi), %rdx -; AVX-NEXT: movsbq 10(%rdi), %rax -; AVX-NEXT: movsbq 11(%rdi), %rcx -; AVX-NEXT: movsbq 12(%rdi), %rbx +; AVX-NEXT: movzbl 1(%rdi), %r9d +; AVX-NEXT: andl $15, %r9d +; AVX-NEXT: movzbl 2(%rdi), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; AVX-NEXT: movzbl 3(%rdi), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; AVX-NEXT: movzbl 4(%rdi), %r14d +; AVX-NEXT: andl $15, %r14d +; AVX-NEXT: movzbl 5(%rdi), %r15d +; AVX-NEXT: andl $15, %r15d +; AVX-NEXT: movzbl 6(%rdi), %r12d +; AVX-NEXT: andl $15, %r12d +; AVX-NEXT: movzbl 7(%rdi), %r13d +; AVX-NEXT: andl $15, %r13d +; AVX-NEXT: movzbl 8(%rdi), %r8d +; AVX-NEXT: andl $15, %r8d +; AVX-NEXT: movzbl 9(%rdi), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: movzbl 10(%rdi), %ecx +; AVX-NEXT: andl $15, %ecx +; AVX-NEXT: movzbl 11(%rdi), %edx +; AVX-NEXT: andl $15, %edx +; AVX-NEXT: movzbl 12(%rdi), %esi +; AVX-NEXT: andl $15, %esi ; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp +; AVX-NEXT: movzbl (%r11,%rbp), %ebx +; AVX-NEXT: vmovd %ebx, %xmm0 +; AVX-NEXT: movzbl 13(%rdi), %r11d +; AVX-NEXT: andl $15, %r11d +; AVX-NEXT: vpinsrb $1, (%r9,%rbp), %xmm0, %xmm0 +; AVX-NEXT: movzbl 14(%rdi), %ebx +; AVX-NEXT: andl $15, %ebx +; AVX-NEXT: movzbl 15(%rdi), %edi +; AVX-NEXT: andl $15, %edi +; AVX-NEXT: movzbl (%rdi,%rbp), %r10d +; AVX-NEXT: movzbl (%rbx,%rbp), %r9d +; AVX-NEXT: movzbl (%r11,%rbp), %r11d ; AVX-NEXT: movzbl (%rsi,%rbp), %esi -; AVX-NEXT: vmovd %esi, %xmm0 -; AVX-NEXT: movsbq 13(%rdi), %rsi -; AVX-NEXT: vpinsrb $1, (%r15,%rbp), %xmm0, %xmm0 -; AVX-NEXT: movsbq 14(%rdi), %r15 -; AVX-NEXT: movsbq 15(%rdi), %rdi -; AVX-NEXT: movzbl (%rdi,%rbp), %edi -; AVX-NEXT: movzbl 
(%r15,%rbp), %r15d -; AVX-NEXT: movzbl (%rsi,%rbp), %esi -; AVX-NEXT: movzbl (%rbx,%rbp), %ebx +; AVX-NEXT: movzbl (%rdx,%rbp), %edx ; AVX-NEXT: movzbl (%rcx,%rbp), %ecx ; AVX-NEXT: movzbl (%rax,%rbp), %eax -; AVX-NEXT: movzbl (%rdx,%rbp), %edx +; AVX-NEXT: movzbl (%r8,%rbp), %r8d ; AVX-NEXT: movzbl (%r13,%rbp), %r13d ; AVX-NEXT: movzbl (%r12,%rbp), %r12d +; AVX-NEXT: movzbl (%r15,%rbp), %r15d ; AVX-NEXT: movzbl (%r14,%rbp), %r14d -; AVX-NEXT: movzbl (%r11,%rbp), %r11d -; AVX-NEXT: movzbl (%r10,%rbp), %r10d -; AVX-NEXT: movzbl (%r9,%rbp), %r9d -; AVX-NEXT: movzbl (%r8,%rbp), %ebp +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload +; AVX-NEXT: movzbl (%rdi,%rbp), %edi +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload +; AVX-NEXT: movzbl (%rbx,%rbp), %ebp ; AVX-NEXT: vpinsrb $2, %ebp, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $3, %r9d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $5, %r11d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $7, %r12d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $8, %r13d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $12, %ebx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $14, %r15d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $11, %edx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $15, %r10d, %xmm0, %xmm0 ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r13 @@ -1106,11 +1272,14 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> %y, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind { ; SSE-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32: ; SSE: # BB#0: -; SSE-NEXT: movslq %edi, %rax +; SSE-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; SSE-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SSE-NEXT: andl $3, %edi ; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movslq %edx, %rdx +; SSE-NEXT: andl $3, %edx ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movslq %ecx, %rcx +; SSE-NEXT: andl $3, %ecx ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -1120,11 +1289,14 @@ define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> ; ; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32: ; AVX: # BB#0: -; AVX-NEXT: movslq %edi, %rax +; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; AVX-NEXT: andl $3, %edi ; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movslq %edx, %rdx +; AVX-NEXT: andl $3, %edx ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movslq %ecx, %rcx +; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss 
{{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -1151,31 +1323,31 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> ; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SSE2-NEXT: movswq %di, %r10 -; SSE2-NEXT: movswq %si, %rsi -; SSE2-NEXT: movswq %dx, %r11 -; SSE2-NEXT: movswq %cx, %rcx +; SSE2-NEXT: andl $7, %edi +; SSE2-NEXT: andl $7, %esi +; SSE2-NEXT: andl $7, %edx +; SSE2-NEXT: andl $7, %ecx ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movswq %r8w, %rdi +; SSE2-NEXT: andl $7, %r8d ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movswq %r9w, %rax -; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %esi -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: andl $7, %r9d +; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movd %esi, %xmm0 ; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzwl -40(%rsp,%r10,2), %eax -; SSE2-NEXT: movzwl -40(%rsp,%r11,2), %ecx +; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; SSE2-NEXT: movzwl -40(%rsp,%rdx,2), %ecx ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; SSE2-NEXT: movzwl -40(%rsp,%r8,2), %eax ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1190,31 +1362,31 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> ; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SSSE3-NEXT: movswq %di, %r10 -; SSSE3-NEXT: movswq %si, %rsi -; SSSE3-NEXT: movswq %dx, %r11 -; SSSE3-NEXT: movswq %cx, %rcx +; SSSE3-NEXT: andl $7, %edi +; SSSE3-NEXT: andl $7, %esi +; SSSE3-NEXT: andl $7, %edx +; SSSE3-NEXT: andl $7, %ecx ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movswq %r8w, %rdi +; SSSE3-NEXT: andl $7, %r8d ; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movswq %r9w, %rax -; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi -; SSSE3-NEXT: xorl %edx, %edx -; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: andl $7, %r9d +; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movd %esi, %xmm0 ; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movd %esi, %xmm2 -; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: movzwl 
-24(%rsp,%r9,2), %eax ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movzwl -40(%rsp,%r10,2), %eax -; SSSE3-NEXT: movzwl -40(%rsp,%r11,2), %ecx +; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; SSSE3-NEXT: movzwl -40(%rsp,%rdx,2), %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; SSSE3-NEXT: movzwl -40(%rsp,%r8,2), %eax ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1229,21 +1401,21 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> ; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SSE41-NEXT: movswq %di, %rax -; SSE41-NEXT: movswq %si, %rsi -; SSE41-NEXT: movswq %dx, %rdx -; SSE41-NEXT: movswq %cx, %r10 +; SSE41-NEXT: andl $7, %edi +; SSE41-NEXT: andl $7, %esi +; SSE41-NEXT: andl $7, %edx +; SSE41-NEXT: andl $7, %ecx ; SSE41-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movswq %r8w, %rdi +; SSE41-NEXT: andl $7, %r8d ; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movswq %r9w, %rcx -; SSE41-NEXT: movzwl -40(%rsp,%rax,2), %eax +; SSE41-NEXT: andl $7, %r9d +; SSE41-NEXT: movzwl -40(%rsp,%rdi,2), %eax ; SSE41-NEXT: movd %eax, %xmm1 ; SSE41-NEXT: pinsrw $1, -24(%rsp,%rsi,2), %xmm1 ; SSE41-NEXT: pinsrw $2, -40(%rsp,%rdx,2), %xmm1 -; SSE41-NEXT: pinsrw $3, -24(%rsp,%r10,2), %xmm1 -; SSE41-NEXT: pinsrw $4, -40(%rsp,%rdi,2), %xmm1 -; SSE41-NEXT: pinsrw $5, -24(%rsp,%rcx,2), %xmm1 +; SSE41-NEXT: pinsrw $3, -24(%rsp,%rcx,2), %xmm1 +; SSE41-NEXT: pinsrw $4, -40(%rsp,%r8,2), %xmm1 +; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm1 ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; SSE41-NEXT: retq @@ -1256,21 +1428,21 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> ; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; AVX1-NEXT: movswq %di, %r10 -; AVX1-NEXT: movswq %si, %r11 -; AVX1-NEXT: movswq %dx, %rdx -; AVX1-NEXT: movswq %cx, %rcx +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: andl $7, %edx +; AVX1-NEXT: andl $7, %ecx ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movswq %r8w, %rdi +; AVX1-NEXT: andl $7, %r8d ; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movswq %r9w, %rax -; AVX1-NEXT: movzwl -40(%rsp,%r10,2), %esi -; AVX1-NEXT: vmovd %esi, %xmm0 -; AVX1-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0 +; AVX1-NEXT: andl $7, %r9d +; AVX1-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 ; AVX1-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0 ; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0 +; 
AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-NEXT: retq @@ -1283,21 +1455,21 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> ; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> ; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; AVX2-NEXT: movswq %di, %r10 -; AVX2-NEXT: movswq %si, %r11 -; AVX2-NEXT: movswq %dx, %rdx -; AVX2-NEXT: movswq %cx, %rcx +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: andl $7, %edx +; AVX2-NEXT: andl $7, %ecx ; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movswq %r8w, %rdi +; AVX2-NEXT: andl $7, %r8d ; AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movswq %r9w, %rax -; AVX2-NEXT: movzwl -40(%rsp,%r10,2), %esi -; AVX2-NEXT: vmovd %esi, %xmm0 -; AVX2-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0 +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 ; AVX2-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0 ; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-shuffle-variable-256.ll b/test/CodeGen/X86/vector-shuffle-variable-256.ll index b43ec058ed91..42b3c11d3d6b 100644 --- a/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ b/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -13,6 +13,10 @@ define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, ; ALL-NEXT: movq %rsp, %rbp ; ALL-NEXT: andq $-32, %rsp ; ALL-NEXT: subq $64, %rsp +; ALL-NEXT: andl $3, %ecx +; ALL-NEXT: andl $3, %edx +; ALL-NEXT: andl $3, %esi +; ALL-NEXT: andl $3, %edi ; ALL-NEXT: vmovaps %ymm0, (%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -40,6 +44,8 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, ; ALL-NEXT: movq %rsp, %rbp ; ALL-NEXT: andq $-32, %rsp ; ALL-NEXT: subq $64, %rsp +; ALL-NEXT: andl $3, %edx +; ALL-NEXT: andl $3, %esi ; ALL-NEXT: vmovaps %ymm0, (%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] @@ -62,6 +68,10 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { ; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64: ; ALL: # BB#0: +; ALL-NEXT: andl $1, %ecx +; ALL-NEXT: andl $1, %edx +; ALL-NEXT: andl $1, %esi +; ALL-NEXT: andl $1, %edi ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -87,6 +97,10 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: andl $3, %edi ; AVX1-NEXT: vmovaps %ymm0, (%rsp) ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = 
mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -105,6 +119,10 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: andl $3, %edi ; AVX2-NEXT: vmovaps %ymm0, (%rsp) ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -134,6 +152,8 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: andl $3, %edi ; AVX1-NEXT: vmovaps %ymm0, (%rsp) ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -150,6 +170,8 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: andl $3, %edi ; AVX2-NEXT: vmovaps %ymm0, (%rsp) ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -173,6 +195,10 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { ; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64: ; AVX1: # BB#0: +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: andl $1, %edi ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -185,6 +211,10 @@ define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i ; ; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64: ; AVX2: # BB#0: +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: andl $1, %edi ; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -212,15 +242,23 @@ define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0 ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: movslq %edi, %rax -; AVX1-NEXT: movslq %esi, %rsi -; AVX1-NEXT: movslq %edx, %rdx -; AVX1-NEXT: movslq %ecx, %r11 -; AVX1-NEXT: movslq %r8d, %r10 +; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> +; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> +; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: andl $7, %edx +; AVX1-NEXT: andl $7, %ecx +; AVX1-NEXT: andl $7, %r8d ; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: movslq %r9d, %r8 -; AVX1-NEXT: movslq 16(%rbp), %rdi -; AVX1-NEXT: movslq 24(%rbp), %rcx +; AVX1-NEXT: andl $7, %r9d +; AVX1-NEXT: movl 16(%rbp), %r10d +; AVX1-NEXT: andl $7, %r10d +; AVX1-NEXT: movl 24(%rbp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -284,15 +322,23 @@ define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 
%i0 define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind { ; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32: ; ALL: # BB#0: -; ALL-NEXT: movslq %edi, %rax -; ALL-NEXT: movslq %esi, %rsi -; ALL-NEXT: movslq %edx, %rdx -; ALL-NEXT: movslq %ecx, %r11 -; ALL-NEXT: movslq %r8d, %r10 +; ALL-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> +; ALL-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> +; ALL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; ALL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ALL-NEXT: andl $3, %edi +; ALL-NEXT: andl $3, %esi +; ALL-NEXT: andl $3, %edx +; ALL-NEXT: andl $3, %ecx +; ALL-NEXT: andl $3, %r8d ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; ALL-NEXT: movslq %r9d, %r8 -; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rdi -; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rcx +; ALL-NEXT: andl $3, %r9d +; ALL-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; ALL-NEXT: andl $3, %r10d +; ALL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; ALL-NEXT: andl $3, %eax ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -331,48 +377,64 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> +; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> +; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> ; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: movslq 32(%rbp), %rax +; AVX1-NEXT: movl 32(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: movslq 40(%rbp), %rax +; AVX1-NEXT: movl 40(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq 48(%rbp), %rax +; AVX1-NEXT: movl 48(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq 56(%rbp), %rax +; AVX1-NEXT: movl 56(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq 64(%rbp), %rax +; AVX1-NEXT: movl 64(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq 72(%rbp), %rax +; AVX1-NEXT: movl 72(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq 80(%rbp), %rax +; AVX1-NEXT: movl 80(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq 88(%rbp), %rax +; AVX1-NEXT: movl 88(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq %edi, %rax -; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX1-NEXT: andl $15, %edi +; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax ; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: movslq %esi, %rax -; AVX1-NEXT: vpinsrw $1, 
(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %edx, %rax -; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %ecx, %rax -; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %r8d, %rax -; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %r9d, %rax -; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq 16(%rbp), %rax +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %edx +; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %ecx +; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %r8d +; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %r9d +; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1 +; AVX1-NEXT: movl 16(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movslq 24(%rbp), %rax +; AVX1-NEXT: movl 24(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -386,48 +448,64 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> +; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> +; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> ; AVX2-NEXT: vmovaps %ymm0, (%rsp) -; AVX2-NEXT: movslq 32(%rbp), %rax +; AVX2-NEXT: movl 32(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: movslq 40(%rbp), %rax +; AVX2-NEXT: movl 40(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 48(%rbp), %rax +; AVX2-NEXT: movl 48(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 56(%rbp), %rax +; AVX2-NEXT: movl 56(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 64(%rbp), %rax +; AVX2-NEXT: movl 64(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 72(%rbp), %rax +; AVX2-NEXT: movl 72(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 80(%rbp), %rax +; AVX2-NEXT: movl 80(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 88(%rbp), %rax +; AVX2-NEXT: movl 88(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq %edi, %rax -; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: movslq %esi, %rax -; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %edx, %rax -; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: 
movslq %ecx, %rax -; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %r8d, %rax -; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %r9d, %rax -; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq 16(%rbp), %rax +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %r8d +; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %r9d +; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1 +; AVX2-NEXT: movl 16(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movslq 24(%rbp), %rax +; AVX2-NEXT: movl 24(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -472,48 +550,64 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind { ; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16: ; AVX1: # BB#0: +; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> +; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> +; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movslq %edi, %rax -; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax +; AVX1-NEXT: andl $7, %edi 
+; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: movslq %esi, %rax -; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %edx, %rax -; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %ecx, %rax -; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %r8d, %rax -; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq %r9d, %rax -; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %edx +; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %ecx +; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %r8d +; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %r9d +; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -521,48 +615,64 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i ; ; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16: ; AVX2: # BB#0: +; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def> +; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def> +; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def> +; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> ; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; AVX2-NEXT: movslq 
%edi, %rax -; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: movslq %esi, %rax -; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %edx, %rax -; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %ecx, %rax -; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %r8d, %rax -; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq %r9d, %rax -; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %edx +; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %ecx +; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %r8d +; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -615,8 +725,12 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwi ; AVX1-NEXT: subq $64, %rsp ; AVX1-NEXT: movq (%rdi), %rax ; AVX1-NEXT: movq 8(%rdi), %rcx +; AVX1-NEXT: andl $3, %eax +; AVX1-NEXT: andl $3, %ecx ; AVX1-NEXT: movq 16(%rdi), %rdx +; AVX1-NEXT: andl $3, %edx ; AVX1-NEXT: movq 24(%rdi), %rsi +; AVX1-NEXT: andl $3, %esi ; AVX1-NEXT: vmovaps %ymm0, (%rsp) ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -637,8 +751,12 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwi ; AVX2-NEXT: subq $64, %rsp ; AVX2-NEXT: movq (%rdi), %rax ; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: andl $3, %eax +; AVX2-NEXT: andl $3, %ecx ; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: andl $3, %edx ; AVX2-NEXT: movq 24(%rdi), %rsi +; AVX2-NEXT: andl $3, %esi ; AVX2-NEXT: vmovaps %ymm0, (%rsp) ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -674,8 +792,12 @@ define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwi ; AVX1: # BB#0: ; AVX1-NEXT: movq (%rdi), %rax ; AVX1-NEXT: movq 8(%rdi), %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: andl $1, %ecx ; AVX1-NEXT: movq 16(%rdi), %rdx +; AVX1-NEXT: andl $1, %edx ; AVX1-NEXT: movq 24(%rdi), %rsi +; AVX1-NEXT: andl $1, %esi ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero @@ -690,8 +812,12 @@ define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwi ; AVX2: # BB#0: ; AVX2-NEXT: movq (%rdi), %rax ; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: movq 24(%rdi), %rsi +; AVX2-NEXT: andl $1, %esi ; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero diff --git a/test/CodeGen/X86/x86-64-double-shifts-var.ll 
b/test/CodeGen/X86/x86-64-double-shifts-var.ll
index 8d2dbbdb5d24..c025ee874b2d 100644
--- a/test/CodeGen/X86/x86-64-double-shifts-var.ll
+++ b/test/CodeGen/X86/x86-64-double-shifts-var.ll
@@ -17,6 +17,7 @@
; RUN: llc < %s -march=x86-64 -mcpu=bdver2 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=bdver3 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=bdver4 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=znver1 | FileCheck %s

; Verify that for the X86_64 processors that are known to have poor latency
; double precision shift instructions we do not generate 'shld' or 'shrd'
diff --git a/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll b/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll
index a62def35acc5..2185fbb845e5 100644
--- a/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll
+++ b/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll
@@ -48,6 +48,43 @@ if.end: ; preds = %if.else, %if.then
ret i32 %b.addr.0, !dbg !14
}

+; When the commoned instructions have the same debug location, this location
+; should be used as the location of the common instruction.
+
+; Generated from source (with -mllvm -no-discriminators and -gno-column-info):
+
+; int test2(int a, int b) {
+;   if(a) b -= foo(); else b -= bar();
+;   return b;
+; }
+
+; CHECK: define i32 @test2
+; CHECK-LABEL: if.end:
+; CHECK: %[[PHI:.*]] = phi i32 [ %call1, %if.else ], [ %call, %if.then ]
+; CHECK: sub nsw i32 %b, %[[PHI]], !dbg ![[DBG:.*]]
+; CHECK: ret i32
+; CHECK: ![[DBG]] = !DILocation(line: 17, scope: !{{.*}})
+
+define i32 @test2(i32 %a, i32 %b) !dbg !15 {
+entry:
+  %tobool = icmp ne i32 %a, 0, !dbg !16
+  br i1 %tobool, label %if.then, label %if.else, !dbg !16
+
+if.then: ; preds = %entry
+  %call = call i32 @foo(), !dbg !16
+  %sub = sub nsw i32 %b, %call, !dbg !16
+  br label %if.end, !dbg !16
+
+if.else: ; preds = %entry
+  %call1 = call i32 @bar(), !dbg !16
+  %sub2 = sub nsw i32 %b, %call1, !dbg !16
+  br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+  %b.addr.0 = phi i32 [ %sub, %if.then ], [ %sub2, %if.else ]
+  ret i32 %b.addr.0, !dbg !17
+}
+
declare i32 @foo()
declare i32 @bar()

@@ -68,3 +105,6 @@ declare i32 @bar()
!12 = !DILocation(line: 12, column: 10, scope: !6)
!13 = !DILocation(line: 12, column: 7, scope: !6)
!14 = !DILocation(line: 13, column: 3, scope: !6)
+!15 = distinct !DISubprogram(name: "test2", scope: !1, file: !1, line: 16, type: !7, isLocal: false, isDefinition: true, scopeLine: 16, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!16 = !DILocation(line: 17, scope: !15)
+!17 = !DILocation(line: 18, scope: !15)
diff --git a/test/DebugInfo/Inputs/implicit-const-test.o b/test/DebugInfo/Inputs/implicit-const-test.o
Binary files differ
new file mode 100644
index 000000000000..eef5cc930313
--- /dev/null
+++ b/test/DebugInfo/Inputs/implicit-const-test.o
diff --git a/test/DebugInfo/dwarfdump-implicit-const.test b/test/DebugInfo/dwarfdump-implicit-const.test
new file mode 100644
index 000000000000..5458ada1a17a
--- /dev/null
+++ b/test/DebugInfo/dwarfdump-implicit-const.test
@@ -0,0 +1,2 @@
+RUN: llvm-dwarfdump -debug-dump=abbrev %p/Inputs/implicit-const-test.o | FileCheck %s
+CHECK: DW_FORM_implicit_const -9223372036854775808
diff --git a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s
index a9ec00939504..7ef07d72200d 100644
--- a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s
+++ b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s
@@ -33,6 +33,9 @@ k:
r:
# R_AARCH64_PREL32: use Q instead of f to fit in 32 bits.
.word Q - .
+# R_AARCH64_PREL64
+ .p2align 3
+ .xword f - .

# LE instructions read as BE
# rtdyld-check: *{4}(g) = 0x6024e0d2
@@ -41,3 +44,4 @@ r:
# rtdyld-check: *{4}(g + 12) = 0xe0bd99f2
# rtdyld-check: *{8}k = f
# rtdyld-check: *{4}r = (Q - r)[31:0]
+# rtdyld-check: *{8}(r + 8) = f - r - 8
diff --git a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s
index f9a03ab40667..069170bdf36b 100644
--- a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s
+++ b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s
@@ -47,7 +47,10 @@ k:
.size k, 16
r:
# R_AARCH64_PREL32: use Q instead of f to fit in 32 bits.
- .word Q - .
+ .word Q - .
+# R_AARCH64_PREL64
+ .p2align 3
+ .xword f - .

# rtdyld-check: *{4}(g) = 0xd2e02460
# rtdyld-check: *{4}(g + 4) = 0xf2c8ace0
@@ -65,6 +68,7 @@ r:
# rtdyld-check: *{8}k = f
# rtdyld-check: *{4}r = (Q - r)[31:0]
+# rtdyld-check: *{8}(r + 8) = f - r - 8

## f & 0xFFF = 0xdef (bits 11:0 of f)
## 0xdef << 10 = 0x37bc00
diff --git a/test/FileCheck/match-full-lines.txt b/test/FileCheck/match-full-lines.txt
new file mode 100644
index 000000000000..d6b10a5e3a9b
--- /dev/null
+++ b/test/FileCheck/match-full-lines.txt
@@ -0,0 +1,53 @@
+// RUN: not FileCheck -match-full-lines -input-file %s %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=ERROR --implicit-check-not=error: %s
+// RUN: not FileCheck -match-full-lines -strict-whitespace -input-file %s %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=ERROR-STRICT --check-prefix=ERROR --implicit-check-not=error: %s
+
+Label 1
+a line
+trailing whitespace
+trailing more whitespace
+Label 2
+a line
+ leading whitespace
+  leading more whitespace
+
+Label 3
+a line
+
+Label 4
+a line
+a random thing
+
+Label 5
+Label 66
+
+// CHECK-LABEL:Label 1
+// CHECK:a line
+// CHECK:trailing whitespace
+// CHECK:trailing more whitespace
+// ERROR-STRICT:error: expected string not found in input
+// ERROR-STRICT:// {{C}}HECK:trailing whitespace
+
+// CHECK-LABEL:Label 2
+// CHECK:a line
+// CHECK-NEXT:leading whitespace
+// CHECK-NEXT: leading more whitespace
+// ERROR-STRICT:error: expected string not found in input
+// ERROR-STRICT:// {{C}}HECK-NEXT:leading whitespace
+
+// CHECK-LABEL:Label 3
+// CHECK:line
+// ERROR:error: expected string not found in input
+// ERROR:// {{C}}HECK:line
+
+// CHECK-LABEL:Label 4
+// CHECK:a line
+// CHECK-NOT:random
+// ERROR:error: {{C}}HECK-NOT: string occurred!
+// ERROR:a random thing + +// CHECK-LABEL:Label 5 +// CHECK-LABEL:Label 6 +// ERROR:error: expected string not found in input +// ERROR:{{C}}HECK-LABEL:Label 6 diff --git a/test/FileCheck/strict-whitespace-match-full-lines.txt b/test/FileCheck/strict-whitespace-match-full-lines.txt deleted file mode 100644 index b0b6135b05d3..000000000000 --- a/test/FileCheck/strict-whitespace-match-full-lines.txt +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: sed 's/^;.*$//' %s \ -; RUN: | FileCheck --strict-whitespace --match-full-lines %s - -bla1 -bla2 - bla3 -bla4 - bla5 - -; CHECK-LABEL:bla1 -; CHECK-NEXT:bla2 -; CHECK-NEXT: bla3 -; CHECK-NEXT:bla4 -; CHECK-NEXT: bla5 diff --git a/test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll b/test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll index af4da14d786f..5d510014a12a 100644 --- a/test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll +++ b/test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll @@ -16,7 +16,7 @@ target triple = "x86_64-apple-macosx10.11.0" ; Find the metadata for @global: -; CHECK: [[METADATA:@.+]] = internal global {{.*}} @global {{.*}} section "__DATA,__asan_globals,regular", align 1 +; CHECK: [[METADATA:@.+]] = internal global {{.*}} @global {{.*}} section "__DATA,__asan_globals,regular", align 64 ; Find the liveness binder for @global and its metadata: ; CHECK: @__asan_binder_global = internal global {{.*}} @global {{.*}} [[METADATA]] {{.*}} section "__DATA,__asan_liveness,regular,live_support" diff --git a/test/MC/AMDGPU/vop_dpp.s b/test/MC/AMDGPU/vop_dpp.s index 52b5df3a44c8..19f21c48ca21 100644 --- a/test/MC/AMDGPU/vop_dpp.s +++ b/test/MC/AMDGPU/vop_dpp.s @@ -1,7 +1,9 @@ -// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI + // RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI // RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI // RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOVI //===----------------------------------------------------------------------===// // Check dpp_ctrl values @@ -527,3 +529,31 @@ v_subb_u32 v1, vcc, v2, v3, vcc row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl: // NOSICI: error: // VI: v_subbrev_u32_dpp v1, vcc, v2, v3, vcc row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x06,0x02,0x3c,0x02,0x01,0x09,0xa1] v_subbrev_u32 v1, vcc, v2, v3, vcc row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 + +//===----------------------------------------------------------------------===// +// Check that immediates and scalar regs are not supported +//===----------------------------------------------------------------------===// + +// NOSICI: error: +// NOVI: error: +v_mov_b32 v0, 1 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 + +// NOSICI: error: +// NOVI: error: +v_and_b32 v0, 42, v1 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 + +// NOSICI: error: +// NOVI: error: +v_add_f32 v0, v1, 345 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 + +// NOSICI: error: +// NOVI: error: +v_mov_b32 v0, s1 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 + +// NOSICI: error: //
NOVI: error: +v_and_b32 v0, s42, v1 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 + +// NOSICI: error: +// NOVI: error: +v_add_f32 v0, v1, s45 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 diff --git a/test/MC/AMDGPU/vop_sdwa.s b/test/MC/AMDGPU/vop_sdwa.s index f139e7c908ff..75db3259f43c 100644 --- a/test/MC/AMDGPU/vop_sdwa.s +++ b/test/MC/AMDGPU/vop_sdwa.s @@ -594,3 +594,39 @@ v_cmp_class_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 // NOSICI: error: // VI: v_cmpx_class_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x02,0x04] v_cmpx_class_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 + +//===----------------------------------------------------------------------===// +// Check that immediates and scalar regs are not supported +//===----------------------------------------------------------------------===// + +// NOSICI: error: +// NOVI: error: invalid operand for instruction +v_mov_b32 v0, 1 src0_sel:BYTE_2 src1_sel:WORD_0 + +// NOSICI: error: +// NOVI: error: invalid operand for instruction +v_and_b32 v0, 42, v1 src0_sel:BYTE_2 src1_sel:WORD_0 + +// NOSICI: error: +// NOVI: error: invalid operand for instruction +v_add_f32 v0, v1, 345 src0_sel:BYTE_2 src1_sel:WORD_0 + +// NOSICI: error: +// NOVI: error: invalid operand for instruction +v_cmpx_class_f32 vcc, -1, 200 src0_sel:BYTE_2 src1_sel:WORD_0 + +// NOSICI: error: +// NOVI: error: invalid operand for instruction +v_mov_b32 v0, s1 src0_sel:BYTE_2 src1_sel:WORD_0 + +// NOSICI: error: +// NOVI: error: invalid operand for instruction +v_and_b32 v0, s42, v1 src0_sel:BYTE_2 src1_sel:WORD_0 + +// NOSICI: error: +// NOVI: error: invalid operand for instruction +v_add_f32 v0, v1, s45 src0_sel:BYTE_2 src1_sel:WORD_0 + +// NOSICI: error: +// NOVI: error: invalid operand for instruction +v_cmpx_class_f32 vcc, s1, s2 src0_sel:BYTE_2 src1_sel:WORD_0 diff --git a/test/MC/ARM/directive-object_arch-2.s b/test/MC/ARM/directive-object_arch-2.s index 3aca434a1e4c..f0596f12385c 100644 --- a/test/MC/ARM/directive-object_arch-2.s +++ b/test/MC/ARM/directive-object_arch-2.s @@ -14,7 +14,7 @@ @ CHECK: } @ CHECK: Attribute { @ CHECK: Tag: 6 -@ CHEKC: Value: 1 +@ CHECK: Value: 1 @ CHECK: TagName: CPU_arch @ CHECK: Description: ARM v4 @ CHECK: } diff --git a/test/MC/ARM/directive-object_arch.s b/test/MC/ARM/directive-object_arch.s index 0707077630e0..c211a3bb57e1 100644 --- a/test/MC/ARM/directive-object_arch.s +++ b/test/MC/ARM/directive-object_arch.s @@ -14,7 +14,7 @@ @ CHECK: } @ CHECK: Attribute { @ CHECK: Tag: 6 -@ CHEKC: Value: 1 +@ CHECK: Value: 1 @ CHECK: TagName: CPU_arch @ CHECK: Description: ARM v4 @ CHECK: } diff --git a/test/ObjectYAML/MachO/DWARF-debug_info.yaml b/test/ObjectYAML/MachO/DWARF-debug_info.yaml index 9a616e9afb9d..b1b6b8ad19e8 100644 --- a/test/ObjectYAML/MachO/DWARF-debug_info.yaml +++ b/test/ObjectYAML/MachO/DWARF-debug_info.yaml @@ -451,6 +451,58 @@ DWARF: - Value: 0x0000000000000001 - AbbrCode: 0x00000000 Values: + debug_line: + - TotalLength: 65 + Version: 2 + PrologueLength: 36 + MinInstLength: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: + - 0 + - 1 + - 1 + - 1 + - 1 + - 0 + - 0 + - 0 + - 1 + - 0 + - 0 + - 1 + IncludeDirs: + Files: + - Name: hello_world.c + DirIdx: 0 + ModTime: 0 + Length: 0 + Opcodes: + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 4294971216 + - Opcode: 0x14 + Data: 4294971216 + - Opcode: DW_LNS_set_column + Data: 3 + - Opcode: DW_LNS_set_prologue_end + Data: 3 + -
Opcode: DW_LNS_const_add_pc + Data: 3 + - Opcode: 0xBB + Data: 3 + - Opcode: 0xBB + Data: 3 + - Opcode: DW_LNS_advance_pc + Data: 11 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 11 +... ... diff --git a/test/ObjectYAML/MachO/DWARF-debug_line.yaml b/test/ObjectYAML/MachO/DWARF-debug_line.yaml new file mode 100644 index 000000000000..c1e015839f97 --- /dev/null +++ b/test/ObjectYAML/MachO/DWARF-debug_line.yaml @@ -0,0 +1,595 @@ +# RUN: yaml2obj %s | obj2yaml | FileCheck %s + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x01000007 + cpusubtype: 0x00000003 + filetype: 0x0000000A + ncmds: 7 + sizeofcmds: 1848 + flags: 0x00000000 + reserved: 0x00000000 +LoadCommands: + - cmd: LC_UUID + cmdsize: 24 + uuid: B4D48511-37F4-3ED4-AFA7-1683DCE69AC4 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 4096 + nsyms: 2 + stroff: 4128 + strsize: 28 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __PAGEZERO + vmaddr: 0 + vmsize: 4294967296 + fileoff: 0 + filesize: 0 + maxprot: 0 + initprot: 0 + nsects: 0 + flags: 0 + - cmd: LC_SEGMENT_64 + cmdsize: 472 + segname: __TEXT + vmaddr: 4294967296 + vmsize: 4096 + fileoff: 0 + filesize: 0 + maxprot: 7 + initprot: 5 + nsects: 5 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0000000100000F50 + size: 52 + offset: 0x00000000 + align: 4 + reloff: 0x00000000 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __stubs + segname: __TEXT + addr: 0x0000000100000F84 + size: 6 + offset: 0x00000000 + align: 1 + reloff: 0x00000000 + nreloc: 0 + flags: 0x80000408 + reserved1: 0x00000000 + reserved2: 0x00000006 + reserved3: 0x00000000 + - sectname: __stub_helper + segname: __TEXT + addr: 0x0000000100000F8C + size: 26 + offset: 0x00000000 + align: 2 + reloff: 0x00000000 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __cstring + segname: __TEXT + addr: 0x0000000100000FA6 + size: 14 + offset: 0x00000000 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000002 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __unwind_info + segname: __TEXT + addr: 0x0000000100000FB4 + size: 72 + offset: 0x00000000 + align: 2 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: __DATA + vmaddr: 4294971392 + vmsize: 4096 + fileoff: 0 + filesize: 0 + maxprot: 7 + initprot: 3 + nsects: 2 + flags: 0 + Sections: + - sectname: __nl_symbol_ptr + segname: __DATA + addr: 0x0000000100001000 + size: 16 + offset: 0x00000000 + align: 3 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000006 + reserved1: 0x00000001 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __la_symbol_ptr + segname: __DATA + addr: 0x0000000100001010 + size: 8 + offset: 0x00000000 + align: 3 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000007 + reserved1: 0x00000003 + reserved2: 0x00000000 + reserved3: 0x00000000 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 4294975488 + vmsize: 4096 + fileoff: 4096 + filesize: 60 + maxprot: 7 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_SEGMENT_64 + cmdsize: 952 + segname: __DWARF + vmaddr: 4294979584 + vmsize: 4096 + fileoff: 8192 + filesize: 764 + maxprot: 7 + initprot: 3 + nsects: 11 + flags: 0 + Sections: + - sectname: __debug_line + segname: __DWARF + addr: 0x0000000100003000 + size: 69 + offset: 
0x00002000 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __debug_pubnames + segname: __DWARF + addr: 0x0000000100003045 + size: 27 + offset: 0x00002045 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __debug_pubtypes + segname: __DWARF + addr: 0x0000000100003060 + size: 35 + offset: 0x00002060 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __debug_aranges + segname: __DWARF + addr: 0x0000000100003083 + size: 48 + offset: 0x00002083 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __debug_info + segname: __DWARF + addr: 0x00000001000030B3 + size: 121 + offset: 0x000020B3 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __debug_abbrev + segname: __DWARF + addr: 0x000000010000312C + size: 76 + offset: 0x0000212C + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __debug_str + segname: __DWARF + addr: 0x0000000100003178 + size: 142 + offset: 0x00002178 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __apple_names + segname: __DWARF + addr: 0x0000000100003206 + size: 60 + offset: 0x00002206 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __apple_namespac + segname: __DWARF + addr: 0x0000000100003242 + size: 36 + offset: 0x00002242 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __apple_types + segname: __DWARF + addr: 0x0000000100003266 + size: 114 + offset: 0x00002266 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + - sectname: __apple_objc + segname: __DWARF + addr: 0x00000001000032D8 + size: 36 + offset: 0x000022D8 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x00000000 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 +LinkEditData: + NameList: + - n_strx: 2 + n_type: 0x0F + n_sect: 1 + n_desc: 16 + n_value: 4294967296 + - n_strx: 22 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 4294971216 + StringTable: + - '' + - '' + - __mh_execute_header + - _main +DWARF: + debug_str: + - '' + - 'clang version 4.0.0 (trunk 288923) (llvm/trunk 288991)' + - hello_world.c + - /Users/cbieneman/dev/open-source/llvm-build-rel + - main + - argc + - argv + - int + - char + debug_abbrev: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_producer + Form: DW_FORM_strp + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + - Attribute: DW_AT_comp_dir + Form: DW_FORM_strp + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Code: 0x00000002 + Tag: DW_TAG_subprogram + Children: 
DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_frame_base + Form: DW_FORM_exprloc + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_prototyped + Form: DW_FORM_flag_present + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Code: 0x00000003 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x00000004 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x00000005 + Tag: DW_TAG_pointer_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + debug_aranges: + - Length: 44 + Version: 2 + CuOffset: 0 + AddrSize: 8 + SegSize: 0 + Descriptors: + - Address: 0x0000000100000F50 + Length: 52 + debug_pubnames: + Length: 23 + Version: 2 + UnitOffset: 0 + UnitSize: 121 + Entries: + - DieOffset: 0x0000002A + Name: main + debug_pubtypes: + Length: 31 + Version: 2 + UnitOffset: 0 + UnitSize: 121 + Entries: + - DieOffset: 0x00000060 + Name: int + - DieOffset: 0x00000071 + Name: char + debug_info: + - Length: 117 + Version: 4 + AbbrOffset: 0 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - Value: 0x000000000000000C + - Value: 0x0000000000000038 + - Value: 0x0000000000000000 + - Value: 0x0000000000000046 + - Value: 0x0000000100000F50 + - Value: 0x0000000000000034 + - AbbrCode: 0x00000002 + Values: + - Value: 0x0000000100000F50 + - Value: 0x0000000000000034 + - Value: 0x0000000000000001 + BlockData: + - 0x56 + - Value: 0x0000000000000076 + - Value: 0x0000000000000001 + - Value: 0x0000000000000003 + - Value: 0x0000000000000001 + - Value: 0x0000000000000060 + - Value: 0x0000000000000001 + - AbbrCode: 0x00000003 + Values: + - Value: 0x0000000000000002 + BlockData: + - 0x91 + - 0x78 + - Value: 0x000000000000007B + - Value: 0x0000000000000001 + - Value: 0x0000000000000003 + - Value: 0x0000000000000060 + - AbbrCode: 0x00000003 + Values: + - Value: 0x0000000000000002 + BlockData: + - 0x91 + - 0x70 + - Value: 0x0000000000000080 + - Value: 0x0000000000000001 + - Value: 0x0000000000000003 + - Value: 0x0000000000000067 + - AbbrCode: 0x00000000 + Values: + - AbbrCode: 0x00000004 + Values: + - Value: 0x0000000000000085 + - Value: 0x0000000000000005 + - Value: 0x0000000000000004 + - AbbrCode: 0x00000005 + Values: + - Value: 0x000000000000006C + - AbbrCode: 0x00000005 + Values: + - Value: 0x0000000000000071 + - AbbrCode: 0x00000004 + Values: + - Value: 0x0000000000000089 + - Value: 0x0000000000000006 + - Value: 0x0000000000000001 + - AbbrCode: 0x00000000 + Values: + debug_line: + - TotalLength: 65 + Version: 2 + PrologueLength: 36 + MinInstLength: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: + - 0 + - 1 + - 1 + - 1 + - 1 + - 0 + - 0 + - 0 + - 1 + - 0 + - 0 + - 1 + IncludeDirs: + Files: + - Name: hello_world.c + DirIdx: 0 + ModTime: 0 + Length: 0 + Opcodes: + - 
Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 4294971216 + - Opcode: 0x14 + Data: 4294971216 + - Opcode: DW_LNS_set_column + Data: 3 + - Opcode: DW_LNS_set_prologue_end + Data: 3 + - Opcode: DW_LNS_const_add_pc + Data: 3 + - Opcode: 0xBB + Data: 3 + - Opcode: 0xBB + Data: 3 + - Opcode: DW_LNS_advance_pc + Data: 11 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 11 +... + +#CHECK: debug_line: +#CHECK: - TotalLength: 65 +#CHECK: Version: 2 +#CHECK: PrologueLength: 36 +#CHECK: MinInstLength: 1 +#CHECK: DefaultIsStmt: 1 +#CHECK: LineBase: 251 +#CHECK: LineRange: 14 +#CHECK: OpcodeBase: 13 +#CHECK: StandardOpcodeLengths: +#CHECK: - 0 +#CHECK: - 1 +#CHECK: - 1 +#CHECK: - 1 +#CHECK: - 1 +#CHECK: - 0 +#CHECK: - 0 +#CHECK: - 0 +#CHECK: - 1 +#CHECK: - 0 +#CHECK: - 0 +#CHECK: - 1 +#CHECK: IncludeDirs: +#CHECK: Files: +#CHECK: - Name: hello_world.c +#CHECK: DirIdx: 0 +#CHECK: ModTime: 0 +#CHECK: Length: 0 +#CHECK: Opcodes: +#CHECK: - Opcode: DW_LNS_extended_op +#CHECK: ExtLen: 9 +#CHECK: SubOpcode: DW_LNE_set_address +#CHECK: Data: 4294971216 +#CHECK: - Opcode: 0x14 +#CHECK: Data: 4294971216 +#CHECK: - Opcode: DW_LNS_set_column +#CHECK: Data: 3 +#CHECK: - Opcode: DW_LNS_set_prologue_end +#CHECK: Data: 3 +#CHECK: - Opcode: DW_LNS_const_add_pc +#CHECK: Data: 3 +#CHECK: - Opcode: 0xBB +#CHECK: Data: 3 +#CHECK: - Opcode: 0xBB +#CHECK: Data: 3 +#CHECK: - Opcode: DW_LNS_advance_pc +#CHECK: Data: 11 +#CHECK: - Opcode: DW_LNS_extended_op +#CHECK: ExtLen: 1 +#CHECK: SubOpcode: DW_LNE_end_sequence +#CHECK: Data: 11 +#CHECK: ... diff --git a/test/Other/loop-pass-ordering.ll b/test/Other/loop-pass-ordering.ll index ceda0d3869dd..ab3839f5cc99 100644 --- a/test/Other/loop-pass-ordering.ll +++ b/test/Other/loop-pass-ordering.ll @@ -8,11 +8,12 @@ ; / \ \ ; loop.0.0 loop.0.1 loop.1.0 ; -; CHECK: Running pass: NoOpLoopPass on loop.1.0 -; CHECK: Running pass: NoOpLoopPass on loop.1 -; CHECK: Running pass: NoOpLoopPass on loop.0.0 -; CHECK: Running pass: NoOpLoopPass on loop.0.1 -; CHECK: Running pass: NoOpLoopPass on loop.0 +; CHECK: Running pass: NoOpLoopPass on Loop at depth 2 containing: %loop.0.0 +; CHECK: Running pass: NoOpLoopPass on Loop at depth 2 containing: %loop.0.1 +; CHECK: Running pass: NoOpLoopPass on Loop at depth 1 containing: %loop.0 +; CHECK: Running pass: NoOpLoopPass on Loop at depth 2 containing: %loop.1.0 +; CHECK: Running pass: NoOpLoopPass on Loop at depth 1 containing: %loop.1 + define void @f() { entry: br label %loop.0 diff --git a/test/Other/new-pass-manager.ll b/test/Other/new-pass-manager.ll index 6224af09a3f1..eae2d855e92f 100644 --- a/test/Other/new-pass-manager.ll +++ b/test/Other/new-pass-manager.ll @@ -433,12 +433,12 @@ ; CHECK-O: Running pass: TailCallElimPass ; CHECK-O: Running pass: SimplifyCFGPass ; CHECK-O: Running pass: ReassociatePass -; CHECK-O: Starting llvm::Loop pass manager run. -; CHECK-O: Finished llvm::Loop pass manager run. +; CHECK-O: Starting Loop pass manager run. +; CHECK-O: Finished Loop pass manager run. ; CHECK-O: Running pass: SimplifyCFGPass ; CHECK-O: Running pass: InstCombinePass -; CHECK-O: Starting llvm::Loop pass manager run. -; CHECK-O: Finished llvm::Loop pass manager run. +; CHECK-O: Starting Loop pass manager run. +; CHECK-O: Finished Loop pass manager run. 
; CHECK-O: Running pass: MemCpyOptPass ; CHECK-O: Running pass: SCCPPass ; CHECK-O: Running pass: BDCEPass @@ -544,20 +544,21 @@ ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: AAManager ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: TargetLibraryAnalysis -; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: AssumptionAnalysis -; CHECK-REPEAT-LOOP-PASS-NEXT: Starting llvm::Loop pass manager run +; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: ScalarEvolutionAnalysis +; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: TargetIRAnalysis +; CHECK-REPEAT-LOOP-PASS-NEXT: Starting Loop pass manager run ; CHECK-REPEAT-LOOP-PASS-NEXT: Running pass: RepeatedPass -; CHECK-REPEAT-LOOP-PASS-NEXT: Starting llvm::Loop pass manager run +; CHECK-REPEAT-LOOP-PASS-NEXT: Starting Loop pass manager run ; CHECK-REPEAT-LOOP-PASS-NEXT: Running pass: NoOpLoopPass -; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Loop pass manager run -; CHECK-REPEAT-LOOP-PASS-NEXT: Starting llvm::Loop pass manager run +; CHECK-REPEAT-LOOP-PASS-NEXT: Finished Loop pass manager run +; CHECK-REPEAT-LOOP-PASS-NEXT: Starting Loop pass manager run ; CHECK-REPEAT-LOOP-PASS-NEXT: Running pass: NoOpLoopPass -; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Loop pass manager run -; CHECK-REPEAT-LOOP-PASS-NEXT: Starting llvm::Loop pass manager run +; CHECK-REPEAT-LOOP-PASS-NEXT: Finished Loop pass manager run +; CHECK-REPEAT-LOOP-PASS-NEXT: Starting Loop pass manager run ; CHECK-REPEAT-LOOP-PASS-NEXT: Running pass: NoOpLoopPass -; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Loop pass manager run -; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Loop pass manager run +; CHECK-REPEAT-LOOP-PASS-NEXT: Finished Loop pass manager run +; CHECK-REPEAT-LOOP-PASS-NEXT: Finished Loop pass manager run ; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Function pass manager run ; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Module pass manager run diff --git a/test/Other/pass-pipeline-parsing.ll b/test/Other/pass-pipeline-parsing.ll index ad222dbef7ab..b303318c7963 100644 --- a/test/Other/pass-pipeline-parsing.ll +++ b/test/Other/pass-pipeline-parsing.ll @@ -144,10 +144,10 @@ ; CHECK-TWO-NOOP-LOOP: Running pass: ModuleToFunctionPassAdaptor ; CHECK-TWO-NOOP-LOOP: Starting llvm::Function pass manager run ; CHECK-TWO-NOOP-LOOP: Running pass: FunctionToLoopPassAdaptor -; CHECK-TWO-NOOP-LOOP: Starting llvm::Loop pass manager run +; CHECK-TWO-NOOP-LOOP: Starting Loop pass manager run ; CHECK-TWO-NOOP-LOOP: Running pass: NoOpLoopPass ; CHECK-TWO-NOOP-LOOP: Running pass: NoOpLoopPass -; CHECK-TWO-NOOP-LOOP: Finished llvm::Loop pass manager run +; CHECK-TWO-NOOP-LOOP: Finished Loop pass manager run ; CHECK-TWO-NOOP-LOOP: Finished llvm::Function pass manager run ; CHECK-TWO-NOOP-LOOP: Finished llvm::Module pass manager run @@ -167,9 +167,9 @@ ; CHECK-NESTED-FP-LP: Running pass: ModuleToFunctionPassAdaptor ; CHECK-NESTED-FP-LP: Starting llvm::Function pass manager run ; CHECK-NESTED-FP-LP: Running pass: FunctionToLoopPassAdaptor -; CHECK-NESTED-FP-LP: Starting llvm::Loop pass manager run +; CHECK-NESTED-FP-LP: Starting Loop pass manager run ; CHECK-NESTED-FP-LP: Running pass: NoOpLoopPass -; CHECK-NESTED-FP-LP: Finished llvm::Loop pass manager run +; CHECK-NESTED-FP-LP: Finished Loop pass manager run ; CHECK-NESTED-FP-LP: Finished llvm::Function pass manager run ; CHECK-NESTED-FP-LP: Finished llvm::Module pass manager run diff --git 
a/test/Transforms/GVN/assume-equal.ll b/test/Transforms/GVN/assume-equal.ll index d423c1685e1d..941f14ce402c 100644 --- a/test/Transforms/GVN/assume-equal.ll +++ b/test/Transforms/GVN/assume-equal.ll @@ -65,22 +65,20 @@ if.then: ; preds = %entry %vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)** %call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8 -; FIXME: those loads could be also direct, but right now the invariant.group -; analysis works only on single block -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %callx = tail call i32 %call1(%struct.A* %0) #1 %vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)** %call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %cally = tail call i32 %call4(%struct.A* %0) #1 %b = bitcast i8* %call to %struct.A** %vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0 %vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)** %vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %unknown = tail call i32 %vfun(%struct.A* %0) #1 br label %if.end diff --git a/test/Transforms/GVN/invariant.group.ll b/test/Transforms/GVN/invariant.group.ll index d0b32d7f3dd8..6f1f357cad65 100644 --- a/test/Transforms/GVN/invariant.group.ll +++ b/test/Transforms/GVN/invariant.group.ll @@ -392,6 +392,44 @@ define void @testNotGlobal() { ret void } +; CHECK-LABEL: define void @handling_loops() +define void @handling_loops() { + %a = alloca %struct.A, align 8 + %1 = bitcast %struct.A* %a to i8* + %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0 + %3 = load i8, i8* @unknownPtr, align 4 + %4 = icmp sgt i8 %3, 0 + br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit + +.lr.ph.i: ; preds = %0 + %5 = bitcast %struct.A* %a to void (%struct.A*)*** + %6 = load i8, i8* @unknownPtr, align 4 + %7 = icmp sgt i8 %6, 1 + br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit + +._crit_edge.preheader: ; preds = %.lr.ph.i + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.preheader, %._crit_edge + %8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ] + %.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0 + %9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8 + ; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a) + call void %9(%struct.A* nonnull %a) #3 + ; CHECK-NOT: call void % + %10 = add nuw nsw i8 %8, 1 + %11 = load i8, i8* @unknownPtr, align 4 + %12 = icmp slt i8 %10, %11 + br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit + +_Z2g2R1A.exit.loopexit: ; preds = %._crit_edge + br label %_Z2g2R1A.exit + +_Z2g2R1A.exit: ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0 + ret void +} + declare void @foo(i8*) declare void @foo2(i8*, i8) diff --git a/test/Transforms/InstCombine/fabs.ll b/test/Transforms/InstCombine/fabs.ll index 6b5f5a949530..aee853ae9eeb 100644 --- a/test/Transforms/InstCombine/fabs.ll +++ b/test/Transforms/InstCombine/fabs.ll @@ -5,6 +5,8 @@ declare float @fabsf(float) declare double @fabs(double) declare fp128 @fabsl(fp128) +declare float 
@llvm.fma.f32(float, float, float) +declare float @llvm.fmuladd.f32(float, float, float) define float @square_fabs_call_f32(float %x) { %mul = fmul float %x, %x @@ -80,7 +82,6 @@ define fp128 @square_fabs_intrinsic_f128(fp128 %x) { ; CHECK-NEXT: ret fp128 %fabsl } -; TODO: This should be able to elimnated the fabs define float @square_nnan_fabs_intrinsic_f32(float %x) { %mul = fmul nnan float %x, %x %fabsf = call float @llvm.fabs.f32(float %mul) @@ -88,8 +89,7 @@ define float @square_nnan_fabs_intrinsic_f32(float %x) { ; CHECK-LABEL: square_nnan_fabs_intrinsic_f32( ; CHECK-NEXT: %mul = fmul nnan float %x, %x -; CHECK-NEXT: %fabsf = call float @llvm.fabs.f32(float %mul) -; CHECK-NEXT: ret float %fabsf +; CHECK-NEXT: ret float %mul } ; Shrinking a library call to a smaller type should not be inhibited by nor inhibit the square optimization. @@ -170,3 +170,47 @@ define float @fabs_select_var_constant_negative(i32 %c, float %x) { %fabs = call float @llvm.fabs.f32(float %select) ret float %fabs } + +; The fabs cannot be eliminated because %x may be a NaN +define float @square_fma_fabs_intrinsic_f32(float %x) { + %fma = call float @llvm.fma.f32(float %x, float %x, float 1.0) + %fabsf = call float @llvm.fabs.f32(float %fma) + ret float %fabsf + +; CHECK-LABEL: @square_fma_fabs_intrinsic_f32( +; CHECK-NEXT: %fma = call float @llvm.fma.f32(float %x, float %x, float 1.000000e+00) +; CHECK-NEXT: %fabsf = call float @llvm.fabs.f32(float %fma) +; CHECK-NEXT: ret float %fabsf +} + +; The fabs cannot be eliminated because %x may be a NaN +define float @square_nnan_fma_fabs_intrinsic_f32(float %x) { + %fma = call nnan float @llvm.fma.f32(float %x, float %x, float 1.0) + %fabsf = call float @llvm.fabs.f32(float %fma) + ret float %fabsf + +; CHECK-LABEL: @square_nnan_fma_fabs_intrinsic_f32( +; CHECK-NEXT: %fma = call nnan float @llvm.fma.f32(float %x, float %x, float 1.000000e+00) +; CHECK-NEXT: ret float %fma +} + +define float @square_fmuladd_fabs_intrinsic_f32(float %x) { + %fmuladd = call float @llvm.fmuladd.f32(float %x, float %x, float 1.0) + %fabsf = call float @llvm.fabs.f32(float %fmuladd) + ret float %fabsf + +; CHECK-LABEL: @square_fmuladd_fabs_intrinsic_f32( +; CHECK-NEXT: %fmuladd = call float @llvm.fmuladd.f32(float %x, float %x, float 1.000000e+00) +; CHECK-NEXT: %fabsf = call float @llvm.fabs.f32(float %fmuladd) +; CHECK-NEXT: ret float %fabsf +} + +define float @square_nnan_fmuladd_fabs_intrinsic_f32(float %x) { + %fmuladd = call nnan float @llvm.fmuladd.f32(float %x, float %x, float 1.0) + %fabsf = call float @llvm.fabs.f32(float %fmuladd) + ret float %fabsf + +; CHECK-LABEL: @square_nnan_fmuladd_fabs_intrinsic_f32( +; CHECK-NEXT: %fmuladd = call nnan float @llvm.fmuladd.f32(float %x, float %x, float 1.000000e+00) +; CHECK-NEXT: ret float %fmuladd +} diff --git a/test/Transforms/InstCombine/fast-math.ll b/test/Transforms/InstCombine/fast-math.ll index 84f24ca0bf24..ad8a9247e4e1 100644 --- a/test/Transforms/InstCombine/fast-math.ll +++ b/test/Transforms/InstCombine/fast-math.ll @@ -241,7 +241,7 @@ define float @fmul2(float %f1) { ; X/C1 * C2 => X * (C2/C1) is disabled if X/C1 has multiple uses @fmul2_external = external global float define float @fmul2_disable(float %f1) { - %div = fdiv fast float 1.000000e+00, %f1 + %div = fdiv fast float 1.000000e+00, %f1 store float %div, float* @fmul2_external %mul = fmul fast float %div, 2.000000e+00 ret float %mul @@ -672,8 +672,7 @@ define double @sqrt_intrinsic_arg_4th(double %x) { ; CHECK-LABEL: sqrt_intrinsic_arg_4th( ; CHECK-NEXT: %mul = fmul 
fast double %x, %x -; CHECK-NEXT: %fabs = call fast double @llvm.fabs.f64(double %mul) -; CHECK-NEXT: ret double %fabs +; CHECK-NEXT: ret double %mul } define double @sqrt_intrinsic_arg_5th(double %x) { @@ -685,9 +684,8 @@ ; CHECK-LABEL: sqrt_intrinsic_arg_5th( ; CHECK-NEXT: %mul = fmul fast double %x, %x -; CHECK-NEXT: %fabs = call fast double @llvm.fabs.f64(double %mul) ; CHECK-NEXT: %sqrt1 = call fast double @llvm.sqrt.f64(double %x) -; CHECK-NEXT: %1 = fmul fast double %fabs, %sqrt1 +; CHECK-NEXT: %1 = fmul fast double %mul, %sqrt1 ; CHECK-NEXT: ret double %1 } diff --git a/test/Transforms/InstCombine/fdiv.ll b/test/Transforms/InstCombine/fdiv.ll index af6a2401a8fc..9a10c4523351 100644 --- a/test/Transforms/InstCombine/fdiv.ll +++ b/test/Transforms/InstCombine/fdiv.ll @@ -49,3 +49,21 @@ define float @test6(float %x, float %y, float %z) nounwind readnone ssp { ; CHECK-NEXT: fmul fast ; CHECK-NEXT: fdiv fast } + +; CHECK-LABEL: @fdiv_fneg_fneg( +; CHECK: %div = fdiv float %x, %y +define float @fdiv_fneg_fneg(float %x, float %y) { + %x.fneg = fsub float -0.0, %x + %y.fneg = fsub float -0.0, %y + %div = fdiv float %x.fneg, %y.fneg + ret float %div +} + +; CHECK-LABEL: @fdiv_fneg_fneg_fast( +; CHECK: %div = fdiv fast float %x, %y +define float @fdiv_fneg_fneg_fast(float %x, float %y) { + %x.fneg = fsub float -0.0, %x + %y.fneg = fsub float -0.0, %y + %div = fdiv fast float %x.fneg, %y.fneg + ret float %div +} diff --git a/test/Transforms/InstCombine/pow-4.ll b/test/Transforms/InstCombine/pow-4.ll index 911ab4d94c6a..9293f14cb106 100644 --- a/test/Transforms/InstCombine/pow-4.ll +++ b/test/Transforms/InstCombine/pow-4.ll @@ -10,8 +10,8 @@ declare float @llvm.pow.f32(float, float) define float @test_simplify_4f(float %x) { ; CHECK-LABEL: @test_simplify_4f( ; CHECK-NOT: pow -; CHECK-NEXT: %1 = fmul float %x, %x -; CHECK-NEXT: %2 = fmul float %1, %1 +; CHECK-NEXT: %1 = fmul fast float %x, %x +; CHECK-NEXT: %2 = fmul fast float %1, %1 ; CHECK-NEXT: ret float %2 %1 = call fast float @llvm.pow.f32(float %x, float 4.000000e+00) ret float %1 @@ -21,8 +21,8 @@ define float @test_simplify_4f(float %x) { define double @test_simplify_3(double %x) { ; CHECK-LABEL: @test_simplify_3( ; CHECK-NOT: pow -; CHECK-NEXT: %1 = fmul double %x, %x -; CHECK-NEXT: %2 = fmul double %1, %x +; CHECK-NEXT: %1 = fmul fast double %x, %x +; CHECK-NEXT: %2 = fmul fast double %1, %x ; CHECK-NEXT: ret double %2 %1 = call fast double @llvm.pow.f64(double %x, double 3.000000e+00) ret double %1 @@ -32,8 +32,8 @@ define double @test_simplify_3(double %x) { define double @test_simplify_4(double %x) { ; CHECK-LABEL: @test_simplify_4( ; CHECK-NOT: pow -; CHECK-NEXT: %1 = fmul double %x, %x -; CHECK-NEXT: %2 = fmul double %1, %1 +; CHECK-NEXT: %1 = fmul fast double %x, %x +; CHECK-NEXT: %2 = fmul fast double %1, %1 ; CHECK-NEXT: ret double %2 %1 = call fast double @llvm.pow.f64(double %x, double 4.000000e+00) ret double %1 @@ -43,11 +43,11 @@ define double @test_simplify_4(double %x) { define double @test_simplify_15(double %x) { ; CHECK-LABEL: @test_simplify_15( ; CHECK-NOT: pow -; CHECK-NEXT: %1 = fmul double %x, %x -; CHECK-NEXT: %2 = fmul double %1, %x -; CHECK-NEXT: %3 = fmul double %2, %2 -; CHECK-NEXT: %4 = fmul double %3, %3 -; CHECK-NEXT: %5 = fmul double %2, %4 +; CHECK-NEXT: %1 = fmul fast double %x, %x +; CHECK-NEXT: %2 = fmul fast double %1, %x +; CHECK-NEXT: %3 = fmul fast double %2, %2 +; CHECK-NEXT: %4 = fmul fast double %3, %3 +; CHECK-NEXT: %5 = fmul fast double %2, %4
; CHECK-NEXT: ret double %5 %1 = call fast double @llvm.pow.f64(double %x, double 1.500000e+01) ret double %1 @@ -57,11 +57,11 @@ define double @test_simplify_15(double %x) { define double @test_simplify_neg_7(double %x) { ; CHECK-LABEL: @test_simplify_neg_7( ; CHECK-NOT: pow -; CHECK-NEXT: %1 = fmul double %x, %x -; CHECK-NEXT: %2 = fmul double %1, %x -; CHECK-NEXT: %3 = fmul double %1, %2 -; CHECK-NEXT: %4 = fmul double %1, %3 -; CHECK-NEXT: %5 = fdiv double 1.000000e+00, %4 +; CHECK-NEXT: %1 = fmul fast double %x, %x +; CHECK-NEXT: %2 = fmul fast double %1, %1 +; CHECK-NEXT: %3 = fmul fast double %2, %x +; CHECK-NEXT: %4 = fmul fast double %1, %3 +; CHECK-NEXT: %5 = fdiv fast double 1.000000e+00, %4 ; CHECK-NEXT: ret double %5 %1 = call fast double @llvm.pow.f64(double %x, double -7.000000e+00) ret double %1 @@ -71,13 +71,13 @@ define double @test_simplify_neg_7(double %x) { define double @test_simplify_neg_19(double %x) { ; CHECK-LABEL: @test_simplify_neg_19( ; CHECK-NOT: pow -; CHECK-NEXT: %1 = fmul double %x, %x -; CHECK-NEXT: %2 = fmul double %1, %1 -; CHECK-NEXT: %3 = fmul double %2, %2 -; CHECK-NEXT: %4 = fmul double %3, %3 -; CHECK-NEXT: %5 = fmul double %1, %4 -; CHECK-NEXT: %6 = fmul double %5, %x -; CHECK-NEXT: %7 = fdiv double 1.000000e+00, %6 +; CHECK-NEXT: %1 = fmul fast double %x, %x +; CHECK-NEXT: %2 = fmul fast double %1, %1 +; CHECK-NEXT: %3 = fmul fast double %2, %2 +; CHECK-NEXT: %4 = fmul fast double %3, %3 +; CHECK-NEXT: %5 = fmul fast double %1, %4 +; CHECK-NEXT: %6 = fmul fast double %5, %x +; CHECK-NEXT: %7 = fdiv fast double 1.000000e+00, %6 ; CHECK-NEXT: ret double %7 %1 = call fast double @llvm.pow.f64(double %x, double -1.900000e+01) ret double %1 @@ -97,11 +97,11 @@ define double @test_simplify_11_23(double %x) { define double @test_simplify_32(double %x) { ; CHECK-LABEL: @test_simplify_32( ; CHECK-NOT: pow -; CHECK-NEXT: %1 = fmul double %x, %x -; CHECK-NEXT: %2 = fmul double %1, %1 -; CHECK-NEXT: %3 = fmul double %2, %2 -; CHECK-NEXT: %4 = fmul double %3, %3 -; CHECK-NEXT: %5 = fmul double %4, %4 +; CHECK-NEXT: %1 = fmul fast double %x, %x +; CHECK-NEXT: %2 = fmul fast double %1, %1 +; CHECK-NEXT: %3 = fmul fast double %2, %2 +; CHECK-NEXT: %4 = fmul fast double %3, %3 +; CHECK-NEXT: %5 = fmul fast double %4, %4 ; CHECK-NEXT: ret double %5 %1 = call fast double @llvm.pow.f64(double %x, double 3.200000e+01) ret double %1 diff --git a/test/Transforms/InstCombine/pow-sqrt.ll b/test/Transforms/InstCombine/pow-sqrt.ll index 1e6166c5f114..52175f1b1247 100644 --- a/test/Transforms/InstCombine/pow-sqrt.ll +++ b/test/Transforms/InstCombine/pow-sqrt.ll @@ -9,5 +9,14 @@ define double @pow_half(double %x) { ; CHECK-NEXT: %sqrt = call fast double @sqrt(double %x) ; CHECK-NEXT: ret double %sqrt -declare double @llvm.pow.f64(double, double) +define double @pow_neghalf(double %x) { + %pow = call fast double @llvm.pow.f64(double %x, double -5.000000e-01) + ret double %pow +} +; CHECK-LABEL: define double @pow_neghalf( +; CHECK-NEXT: %sqrt = call fast double @sqrt(double %x) #0 +; CHECK-NEXT: %sqrtrecip = fdiv fast double 1.000000e+00, %sqrt +; CHECK-NEXT: ret double %sqrtrecip + +declare double @llvm.pow.f64(double, double) diff --git a/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/test/Transforms/InstSimplify/floating-point-arithmetic.ll index 21c9fdde1506..dfdb88dcc858 100644 --- a/test/Transforms/InstSimplify/floating-point-arithmetic.ll +++ b/test/Transforms/InstSimplify/floating-point-arithmetic.ll @@ -103,3 +103,95 @@ define float @PR22688(float %x) { 
ret float %7 } +declare float @llvm.fabs.f32(float) + +; CHECK-LABEL: @fabs_select_positive_constants( +; CHECK: %select = select i1 %cmp, float 1.000000e+00, float 2.000000e+00 +; CHECK-NEXT: ret float %select +define float @fabs_select_positive_constants(i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float 1.0, float 2.0 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} + +; CHECK-LABEL: @fabs_select_constant_variable( +; CHECK: %select = select i1 %cmp, float 1.000000e+00, float %x +; CHECK-NEXT: %fabs = call float @llvm.fabs.f32(float %select) +define float @fabs_select_constant_variable(i32 %c, float %x) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float 1.0, float %x + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} + +; CHECK-LABEL: @fabs_select_neg0_pos0( +; CHECK: %select = select i1 %cmp, float -0.000000e+00, float 0.000000e+00 +; CHECK: %fabs = call float @llvm.fabs.f32(float %select) +; CHECK-NEXT: ret float %fabs +define float @fabs_select_neg0_pos0(float addrspace(1)* %out, i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float -0.0, float 0.0 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} + +; CHECK-LABEL: @fabs_select_neg0_neg1( +; CHECK: %select = select i1 %cmp, float -0.000000e+00, float -1.000000e+00 +; CHECK: %fabs = call float @llvm.fabs.f32(float %select) +define float @fabs_select_neg0_neg1(float addrspace(1)* %out, i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float -0.0, float -1.0 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} + +; CHECK-LABEL: @fabs_select_nan_nan( +; CHECK: %select = select i1 %cmp, float 0x7FF8000000000000, float 0x7FF8000100000000 +; CHECK-NEXT: ret float %select +define float @fabs_select_nan_nan(float addrspace(1)* %out, i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float 0x7FF8000000000000, float 0x7FF8000100000000 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} + +; CHECK-LABEL: @fabs_select_negnan_nan( +; CHECK: %select = select i1 %cmp, float 0xFFF8000000000000, float 0x7FF8000000000000 +; CHECK: %fabs = call float @llvm.fabs.f32(float %select) +define float @fabs_select_negnan_nan(float addrspace(1)* %out, i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float 0xFFF8000000000000, float 0x7FF8000000000000 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} + +; CHECK-LABEL: @fabs_select_negnan_negnan( +; CHECK: %select = select i1 %cmp, float 0xFFF8000000000000, float 0x7FF8000100000000 +; CHECK: %fabs = call float @llvm.fabs.f32(float %select) +define float @fabs_select_negnan_negnan(float addrspace(1)* %out, i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float 0xFFF8000000000000, float 0x7FF8000100000000 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} + +; CHECK-LABEL: @fabs_select_negnan_negzero( +; CHECK: %select = select i1 %cmp, float 0xFFF8000000000000, float -0.000000e+00 +; CHECK: %fabs = call float @llvm.fabs.f32(float %select) +define float @fabs_select_negnan_negzero(float addrspace(1)* %out, i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float 0xFFF8000000000000, float -0.0 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} + +; CHECK-LABEL: @fabs_select_negnan_zero( +; CHECK: %select = select i1 %cmp, float 0xFFF8000000000000, float 0.000000e+00 +; CHECK: %fabs = call float @llvm.fabs.f32(float %select) 
+define float @fabs_select_negnan_zero(float addrspace(1)* %out, i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float 0xFFF8000000000000, float 0.0 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} diff --git a/test/Transforms/LICM/argmemonly-call.ll b/test/Transforms/LICM/argmemonly-call.ll index 18d7f8351dca..fe7c6af6d6d9 100644 --- a/test/Transforms/LICM/argmemonly-call.ll +++ b/test/Transforms/LICM/argmemonly-call.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -basicaa -licm %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s declare i32 @foo() readonly argmemonly nounwind declare i32 @foo2() readonly nounwind declare i32 @bar(i32* %loc2) readonly argmemonly nounwind diff --git a/test/Transforms/LICM/assume.ll b/test/Transforms/LICM/assume.ll index f6369ac659f0..c8c93ae89b91 100644 --- a/test/Transforms/LICM/assume.ll +++ b/test/Transforms/LICM/assume.ll @@ -1,5 +1,5 @@ ; RUN: opt -licm -basicaa < %s -S | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s define void @f_0(i1 %p) nounwind ssp { ; CHECK-LABEL: @f_0( diff --git a/test/Transforms/LICM/atomics.ll b/test/Transforms/LICM/atomics.ll index 5dcd4bb8c05a..d23cb49c5486 100644 --- a/test/Transforms/LICM/atomics.ll +++ b/test/Transforms/LICM/atomics.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -S -basicaa -licm | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s ; Check that we can hoist unordered loads define i32 @test1(i32* nocapture %y) nounwind uwtable ssp { diff --git a/test/Transforms/LICM/basictest.ll b/test/Transforms/LICM/basictest.ll index 570e226d2372..78c87ce76517 100644 --- a/test/Transforms/LICM/basictest.ll +++ b/test/Transforms/LICM/basictest.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -licm | llvm-dis -; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s | llvm-dis +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s | llvm-dis define void @testfunc(i32 %i) { ; <label>:0 diff --git a/test/Transforms/LICM/constexpr.ll b/test/Transforms/LICM/constexpr.ll index 726246776dc6..8ffc73513600 100644 --- a/test/Transforms/LICM/constexpr.ll +++ b/test/Transforms/LICM/constexpr.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -S -basicaa -licm | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s ; This fixes PR22460 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/test/Transforms/LICM/crash.ll b/test/Transforms/LICM/crash.ll index 
75c27b8def0c..93ea2192e03e 100644 --- a/test/Transforms/LICM/crash.ll +++ b/test/Transforms/LICM/crash.ll @@ -1,5 +1,5 @@ ; RUN: opt -licm -disable-output < %s -; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -disable-output < %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-darwin10.0.0" diff --git a/test/Transforms/LICM/debug-value.ll b/test/Transforms/LICM/debug-value.ll index ab77caa2bae0..831a0d8b51f9 100644 --- a/test/Transforms/LICM/debug-value.ll +++ b/test/Transforms/LICM/debug-value.ll @@ -1,5 +1,5 @@ ; RUN: opt -licm -basicaa < %s -S | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s define void @dgefa() nounwind ssp { entry: diff --git a/test/Transforms/LICM/extra-copies.ll b/test/Transforms/LICM/extra-copies.ll index 84a3bc9ec6a6..2f8e814c15ee 100644 --- a/test/Transforms/LICM/extra-copies.ll +++ b/test/Transforms/LICM/extra-copies.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -licm -S | FileCheck %s -; RUN: opt -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s +; RUN: opt -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s ; PR19835 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Transforms/LICM/funclet.ll b/test/Transforms/LICM/funclet.ll index 9bdc6dbcde88..6b5f11507ed9 100644 --- a/test/Transforms/LICM/funclet.ll +++ b/test/Transforms/LICM/funclet.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -licm -S | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" target triple = "i386-pc-windows-msvc18.0.0" diff --git a/test/Transforms/LICM/hoist-bitcast-load.ll b/test/Transforms/LICM/hoist-bitcast-load.ll index 5752aecde387..6ef00738820e 100644 --- a/test/Transforms/LICM/hoist-bitcast-load.ll +++ b/test/Transforms/LICM/hoist-bitcast-load.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -basicaa -licm < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='loop-simplify,require<aa>,require<targetir>,require<scalar-evolution>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='loop-simplify,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Transforms/LICM/hoist-deref-load.ll b/test/Transforms/LICM/hoist-deref-load.ll index ed6ec7694d3c..e67becdeb5e4 100644 --- a/test/Transforms/LICM/hoist-deref-load.ll +++ b/test/Transforms/LICM/hoist-deref-load.ll @@ -1,5 
+1,5 @@ ; RUN: opt -S -basicaa -licm < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='loop-simplify,require<aa>,require<targetir>,require<scalar-evolution>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='loop-simplify,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Transforms/LICM/hoist-nounwind.ll b/test/Transforms/LICM/hoist-nounwind.ll index 081729f808bf..e9720235893a 100644 --- a/test/Transforms/LICM/hoist-nounwind.ll +++ b/test/Transforms/LICM/hoist-nounwind.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -basicaa -licm < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Transforms/LICM/hoist-round.ll b/test/Transforms/LICM/hoist-round.ll index a87709b810d2..9c6a3a180b50 100644 --- a/test/Transforms/LICM/hoist-round.ll +++ b/test/Transforms/LICM/hoist-round.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -licm < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s target datalayout = "E-m:e-p:32:32-i8:8:8-i16:16:16-i64:32:32-f64:32:32-v64:32:32-v128:32:32-a0:0:32-n32" diff --git a/test/Transforms/LICM/hoisting.ll b/test/Transforms/LICM/hoisting.ll index c61131b476b9..29595b3e1cc0 100644 --- a/test/Transforms/LICM/hoisting.ll +++ b/test/Transforms/LICM/hoisting.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -licm -S | FileCheck %s -; RUN: opt -lcssa %s | opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S | FileCheck %s +; RUN: opt -lcssa %s | opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S | FileCheck %s @X = global i32 0 ; <i32*> [#uses=1] diff --git a/test/Transforms/LICM/lcssa-ssa-promoter.ll b/test/Transforms/LICM/lcssa-ssa-promoter.ll index d466b3baffc8..0644a627f718 100644 --- a/test/Transforms/LICM/lcssa-ssa-promoter.ll +++ b/test/Transforms/LICM/lcssa-ssa-promoter.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -basicaa -licm < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s| FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s| FileCheck %s ; ; Manually validate LCSSA form is preserved even after SSAUpdater is used to ; promote things in the loop bodies. 
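For reference, a minimal sketch of the LCSSA shape that test guards (illustrative IR, not taken from lcssa-ssa-promoter.ll): every value defined inside a loop and used outside of it must be routed through a phi in the exit block, and promotion via SSAUpdater has to rebuild such phis rather than let in-loop definitions escape directly.

; LCSSA form: %sum is defined in %loop, so its only out-of-loop use goes
; through the single-entry phi %sum.lcssa in the exit block.
define i32 @lcssa_shape(i32 %n) {
entry:
  br label %loop
loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %sum = mul i32 %i, 2
  %i.next = add i32 %i, 1
  %done = icmp eq i32 %i.next, %n
  br i1 %done, label %exit, label %loop
exit:
  %sum.lcssa = phi i32 [ %sum, %loop ]
  ret i32 %sum.lcssa
}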
diff --git a/test/Transforms/LICM/no-preheader-test.ll b/test/Transforms/LICM/no-preheader-test.ll index 4b6847cdad51..5cfa462dfc4a 100644 --- a/test/Transforms/LICM/no-preheader-test.ll +++ b/test/Transforms/LICM/no-preheader-test.ll @@ -1,6 +1,6 @@ ; Test that LICM works when there is not a loop-preheader ; RUN: opt < %s -licm | llvm-dis -; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s | llvm-dis +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s | llvm-dis define void @testfunc(i32 %i.s, i1 %ifcond) { br i1 %ifcond, label %Then, label %Else diff --git a/test/Transforms/LICM/opt-remarks-conditional-load.ll b/test/Transforms/LICM/opt-remarks-conditional-load.ll new file mode 100644 index 000000000000..96bdeaff66ef --- /dev/null +++ b/test/Transforms/LICM/opt-remarks-conditional-load.ll @@ -0,0 +1,47 @@ +; RUN: opt < %s -licm -pass-remarks-missed=licm -o /dev/null 2>&1 | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' %s -o /dev/null -pass-remarks-missed=licm 2>&1 | FileCheck %s +target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" + +; With the load from %p conditional, we can't optimize this and the remark +; should tell us about it. + +define void @test(i32* %array, i32* noalias %p) { +Entry: + br label %Loop + +Loop: + %j = phi i32 [ 0, %Entry ], [ %Next, %else] + %addr = getelementptr i32, i32* %array, i32 %j + %a = load i32, i32* %addr + %c = icmp eq i32 %a, 0 + br i1 %c, label %then, label %else + +then: +; CHECK: remark: /tmp/kk.c:2:20: failed to hoist load with loop-invariant address because load is conditionally executed + %b = load i32, i32* %p, !dbg !8 + %a2 = add i32 %a, %b + store i32 %a2, i32* %addr + br label %else + +else: + %Next = add i32 %j, 1 + %cond = icmp eq i32 %Next, 0 + br i1 %cond, label %Out, label %Loop + +Out: + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2) +!1 = !DIFile(filename: "/tmp/kk.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"PIC Level", i32 2} +!5 = !{!"clang version 3.9.0 "} +!6 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2) +!7 = !DISubroutineType(types: !2) +!8 = !DILocation(line: 2, column: 20, scope: !6) diff --git a/test/Transforms/LICM/opt-remarks-intervening-store.ll b/test/Transforms/LICM/opt-remarks-intervening-store.ll new file mode 100644 index 000000000000..95389ceaf9a9 --- /dev/null +++ b/test/Transforms/LICM/opt-remarks-intervening-store.ll @@ -0,0 +1,67 @@ +; RUN: opt < %s -licm -pass-remarks-missed=licm -o /dev/null 2>&1 | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' %s -o /dev/null -pass-remarks-missed=licm 2>&1 | FileCheck %s +target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" + +; Without the noalias on %p, we can't
optimize this and the remark should tell +; us about it. + +define void @test(i32* %array, i32* %p) { +Entry: + br label %Loop + +Loop: + %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ] + %addr = getelementptr i32, i32* %array, i32 %j + %a = load i32, i32* %addr +; CHECK: remark: /tmp/kk.c:2:20: failed to move load with loop-invariant address because the loop may invalidate its value + %b = load i32, i32* %p, !dbg !8 + %a2 = add i32 %a, %b + store i32 %a2, i32* %addr + %Next = add i32 %j, 1 + %cond = icmp eq i32 %Next, 0 + br i1 %cond, label %Out, label %Loop + +Out: + ret void +} + +; This illustrates why we need to check loop-invariance before issuing this +; remark. + +define i32 @invalidated_load_with_non_loop_invariant_address(i32* %array, i32* %array2) { +Entry: + br label %Loop + +Loop: + %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ] + +; CHECK-NOT: /tmp/kk.c:3:20: {{.*}} loop-invariant + %addr = getelementptr i32, i32* %array, i32 %j + %a = load i32, i32* %addr, !dbg !9 + + %addr2 = getelementptr i32, i32* %array2, i32 %j + store i32 %j, i32* %addr2 + + %Next = add i32 %j, 1 + %cond = icmp eq i32 %Next, 0 + br i1 %cond, label %Out, label %Loop + +Out: + %a2 = phi i32 [ %a, %Loop ] + ret i32 %a2 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2) +!1 = !DIFile(filename: "/tmp/kk.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"PIC Level", i32 2} +!5 = !{!"clang version 3.9.0 "} +!6 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2) +!7 = !DISubroutineType(types: !2) +!8 = !DILocation(line: 2, column: 20, scope: !6) +!9 = !DILocation(line: 3, column: 20, scope: !6) diff --git a/test/Transforms/LICM/opt-remarks.ll b/test/Transforms/LICM/opt-remarks.ll new file mode 100644 index 000000000000..f0ef386c9f9a --- /dev/null +++ b/test/Transforms/LICM/opt-remarks.ll @@ -0,0 +1,81 @@ +; RUN: opt < %s -licm -pass-remarks=licm -o /dev/null 2>&1 | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' %s -o /dev/null -pass-remarks=licm 2>&1 | FileCheck %s +target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" + +define void @hoist(i32* %array, i32* noalias %p) { +Entry: + br label %Loop + +Loop: + %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ] + %addr = getelementptr i32, i32* %array, i32 %j + %a = load i32, i32* %addr +; CHECK: remark: /tmp/kk.c:2:20: hosting load + %b = load i32, i32* %p, !dbg !8 + %a2 = add i32 %a, %b + store i32 %a2, i32* %addr + %Next = add i32 %j, 1 + %cond = icmp eq i32 %Next, 0 + br i1 %cond, label %Out, label %Loop + +Out: + ret void +} + +define i32 @sink(i32* %array, i32* noalias %p, i32 %b) { +Entry: + br label %Loop + +Loop: + %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ] + %addr = getelementptr i32, i32* %array, i32 %j + %a = load i32, i32* %addr + %a2 = add i32 %a, %b + store i32 %a2, i32* %addr +; CHECK: remark: /tmp/kk.c:2:21: sinking add + %a3 = add i32 %a, 1, !dbg !9 + %Next = add i32 %j, 1 + %cond = icmp eq i32 %Next, 0 + br i1 %cond, label %Out, label %Loop + +Out: + %a4 = phi i32 [ %a3, %Loop ] + ret i32 %a4 +}
+ +define void @promote(i32* %array, i32* noalias %p) { +Entry: + br label %Loop + +Loop: + %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ] + %addr = getelementptr i32, i32* %array, i32 %j + %a = load i32, i32* %addr + %b = load i32, i32* %p + %a2 = add i32 %a, %b + store i32 %a2, i32* %addr +; CHECK: remark: /tmp/kk.c:2:22: Moving accesses to memory location out of the loop + store i32 %b, i32* %p, !dbg !10 + %Next = add i32 %j, 1 + %cond = icmp eq i32 %Next, 0 + br i1 %cond, label %Out, label %Loop + +Out: + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2) +!1 = !DIFile(filename: "/tmp/kk.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"PIC Level", i32 2} +!5 = !{!"clang version 3.9.0 "} +!6 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2) +!7 = !DISubroutineType(types: !2) +!8 = !DILocation(line: 2, column: 20, scope: !6) +!9 = !DILocation(line: 2, column: 21, scope: !6) +!10 = !DILocation(line: 2, column: 22, scope: !6) diff --git a/test/Transforms/LICM/preheader-safe.ll b/test/Transforms/LICM/preheader-safe.ll index adc4f4237a29..8f82d1c68bb3 100644 --- a/test/Transforms/LICM/preheader-safe.ll +++ b/test/Transforms/LICM/preheader-safe.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -licm < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s declare void @use_nothrow(i64 %a) nounwind declare void @use(i64 %a) diff --git a/test/Transforms/LICM/promote-order.ll b/test/Transforms/LICM/promote-order.ll index 7d87bb221b76..b7e0b7c6c437 100644 --- a/test/Transforms/LICM/promote-order.ll +++ b/test/Transforms/LICM/promote-order.ll @@ -1,5 +1,5 @@ ; RUN: opt -tbaa -basicaa -licm -S < %s | FileCheck %s -; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s +; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s ; LICM should keep the stores in their original order when it sinks/promotes them. ; rdar://12045203 diff --git a/test/Transforms/LICM/promote-tls.ll b/test/Transforms/LICM/promote-tls.ll index 1849afade0e4..076d05cf094a 100644 --- a/test/Transforms/LICM/promote-tls.ll +++ b/test/Transforms/LICM/promote-tls.ll @@ -1,5 +1,5 @@ ; RUN: opt -tbaa -basicaa -licm -S < %s | FileCheck %s -; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s +; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s ; If we can prove a local is thread local, we can insert stores during ; promotion which wouldn't be legal otherwise. 
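The LICM remark tests above anchor their !dbg locations to a /tmp/kk.c that is not itself part of the change; a minimal C sketch of the hoisting scenario they encode (function and parameter names hypothetical, only the loop shape matters) would be:

/* Hypothetical stand-in for the /tmp/kk.c behind the remark tests: *p is
   loop-invariant and, thanks to restrict, provably not clobbered by the
   stores to array, so LICM can hoist the load and report "hoisting load". */
void hoist(int *array, int *restrict p, int n) {
  for (int j = 0; j < n; j++)
    array[j] += *p;
}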
diff --git a/test/Transforms/LICM/scalar-promote-memmodel.ll b/test/Transforms/LICM/scalar-promote-memmodel.ll index ceee7292ac5c..c09c2b361e02 100644 --- a/test/Transforms/LICM/scalar-promote-memmodel.ll +++ b/test/Transforms/LICM/scalar-promote-memmodel.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -basicaa -licm -S | FileCheck %s -; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s +; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s ; Make sure we don't hoist a conditionally-executed store out of the loop; ; it would violate the concurrency memory model diff --git a/test/Transforms/LICM/scalar_promote-unwind.ll b/test/Transforms/LICM/scalar_promote-unwind.ll index 22e7e50c22e5..dd3693b4af63 100644 --- a/test/Transforms/LICM/scalar_promote-unwind.ll +++ b/test/Transforms/LICM/scalar_promote-unwind.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -basicaa -licm -S | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Transforms/LICM/scalar_promote.ll b/test/Transforms/LICM/scalar_promote.ll index dc5151be8a82..c88701154b8f 100644 --- a/test/Transforms/LICM/scalar_promote.ll +++ b/test/Transforms/LICM/scalar_promote.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -basicaa -tbaa -licm -S | FileCheck %s -; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s +; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" @X = global i32 7 ; <i32*> [#uses=4] diff --git a/test/Transforms/LICM/speculate.ll b/test/Transforms/LICM/speculate.ll index fed1cbaa8555..5d0108b129df 100644 --- a/test/Transforms/LICM/speculate.ll +++ b/test/Transforms/LICM/speculate.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -licm < %s | FileCheck %s -; RUN: opt -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s +; RUN: opt -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s ; UDiv is safe to speculate if the denominator is known non-zero. 
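The conditionally-executed-store comment in scalar-promote-memmodel.ll above corresponds to a pattern like this C sketch (illustrative only, not taken from the test):

int flag_slot; /* hypothetical global */

/* Hoisting or promoting the store to flag_slot into an unconditional store
   outside the loop would let another thread observe a write that the
   original program never performs when no p[i] is nonzero. */
void update(int *p, int n) {
  for (int i = 0; i < n; i++)
    if (p[i])
      flag_slot = i;
}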
diff --git a/test/Transforms/LICM/volatile-alias.ll b/test/Transforms/LICM/volatile-alias.ll index 7836df004c0f..f387012015c7 100644 --- a/test/Transforms/LICM/volatile-alias.ll +++ b/test/Transforms/LICM/volatile-alias.ll @@ -1,5 +1,5 @@ ; RUN: opt -basicaa -sroa -loop-rotate -licm -S < %s | FileCheck %s -; RUN: opt -basicaa -sroa -loop-rotate %s | opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S | FileCheck %s +; RUN: opt -basicaa -sroa -loop-rotate %s | opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S | FileCheck %s ; The objects *p and *q are aliased to each other, but even though *q is ; volatile, *p can be considered invariant in the loop. Check if it is moved ; out of the loop. diff --git a/test/Transforms/LoopSimplify/preserve-scev.ll b/test/Transforms/LoopSimplify/preserve-scev.ll index f6fa8afc56b5..b78ce97fb46a 100644 --- a/test/Transforms/LoopSimplify/preserve-scev.ll +++ b/test/Transforms/LoopSimplify/preserve-scev.ll @@ -1,14 +1,38 @@ -; RUN: opt -S < %s -indvars | opt -analyze -iv-users | grep "%cmp = icmp slt i32" | grep "= {%\.ph,+,1}<%for.cond>" -; PR8079 +; RUN: opt -S < %s -analyze -scalar-evolution -loop-simplify -scalar-evolution | FileCheck %s ; Provide legal integer types. target datalayout = "n8:16:32:64" -; LoopSimplify should invalidate indvars when splitting out the -; inner loop. - @maxStat = external global i32 +; LoopSimplify should invalidate SCEV when splitting out the +; inner loop. +; +; First SCEV print: +; CHECK-LABEL: Classifying expressions for: @test +; CHECK: %[[PHI:.*]] = phi i32 [ 0, %entry ], [ %{{.*}}, %if.then5 ], [ %[[PHI]], %if.end ] +; CHECK-LABEL: Determining loop execution counts for: @test +; CHECK: Loop %for.body18: Unpredictable backedge-taken count. +; CHECK: Loop %for.body18: Unpredictable max backedge-taken count. +; CHECK: Loop %for.body18: Unpredictable predicated backedge-taken count. +; CHECK: Loop %for.cond: <multiple exits> Unpredictable backedge-taken count. +; CHECK: Loop %for.cond: Unpredictable max backedge-taken count. +; CHECK: Loop %for.cond: Unpredictable predicated backedge-taken count. +; +; Now simplify the loop, which should cause SCEV to re-compute more precise +; info here in addition to having preheader PHIs. Second SCEV print: +; CHECK-LABEL: Classifying expressions for: @test +; CHECK: phi i32 [ %{{.*}}, %if.then5 ], [ 0, %entry ] +; CHECK-LABEL: Determining loop execution counts for: @test +; CHECK: Loop %for.body18: Unpredictable backedge-taken count. +; CHECK: Loop %for.body18: Unpredictable max backedge-taken count. +; CHECK: Loop %for.body18: Unpredictable predicated backedge-taken count. +; CHECK: Loop %for.cond: <multiple exits> Unpredictable backedge-taken count. +; CHECK: Loop %for.cond: max backedge-taken count is -2147483647 +; CHECK: Loop %for.cond: Unpredictable predicated backedge-taken count. +; CHECK: Loop %for.cond.outer: <multiple exits> Unpredictable backedge-taken count. +; CHECK: Loop %for.cond.outer: Unpredictable max backedge-taken count. +; CHECK: Loop %for.cond.outer: Unpredictable predicated backedge-taken count. define i32 @test() nounwind { entry: br label %for.cond @@ -52,12 +76,27 @@ return: ; preds = %for.body18, %for.bo declare void @foo() nounwind -; Notify SCEV when removing an ExitingBlock. 
-; CHECK-LABEL: @mergeExit( -; CHECK: while.cond191: -; CHECK: br i1 %or.cond, label %while.body197 -; CHECK-NOT: land.rhs: -; CHECK: ret +; Notify SCEV when removing an ExitingBlock. This only changes the +; backedge-taken information. +; +; First SCEV print: +; CHECK-LABEL: Determining loop execution counts for: @mergeExit +; CHECK: Loop %while.cond191: <multiple exits> Unpredictable backedge-taken count. +; CHECK: Loop %while.cond191: max backedge-taken count is -1 +; CHECK: Loop %while.cond191: Unpredictable predicated backedge-taken count. +; CHECK: Loop %while.cond191.outer: <multiple exits> Unpredictable backedge-taken count. +; CHECK: Loop %while.cond191.outer: Unpredictable max backedge-taken count. +; CHECK: Loop %while.cond191.outer: Unpredictable predicated backedge-taken count. +; +; After simplifying, the max backedge count is refined. +; Second SCEV print: +; CHECK-LABEL: Determining loop execution counts for: @mergeExit +; CHECK: Loop %while.cond191: <multiple exits> Unpredictable backedge-taken count. +; CHECK: Loop %while.cond191: max backedge-taken count is 0 +; CHECK: Loop %while.cond191: Unpredictable predicated backedge-taken count. +; CHECK: Loop %while.cond191.outer: <multiple exits> Unpredictable backedge-taken count. +; CHECK: Loop %while.cond191.outer: Unpredictable max backedge-taken count. +; CHECK: Loop %while.cond191.outer: Unpredictable predicated backedge-taken count. define void @mergeExit(i32 %MapAttrCount) nounwind uwtable ssp { entry: br i1 undef, label %if.then124, label %if.end126 diff --git a/test/Transforms/LoopUnroll/peel-loop-pgo.ll b/test/Transforms/LoopUnroll/peel-loop-pgo.ll index 18309b0691fa..a87d5643e7e9 100644 --- a/test/Transforms/LoopUnroll/peel-loop-pgo.ll +++ b/test/Transforms/LoopUnroll/peel-loop-pgo.ll @@ -3,7 +3,12 @@ ; Make sure we use the profile information correctly to peel-off 3 iterations ; from the loop, and update the branch weights for the peeled loop properly. + +; CHECK: Loop Unroll: F[basic] ; CHECK: PEELING loop %for.body with iteration count 3! +; CHECK: Loop Unroll: F[optsize] +; CHECK-NOT: PEELING + ; CHECK-LABEL: @basic ; CHECK: br i1 %{{.*}}, label %[[NEXT0:.*]], label %for.cond.for.end_crit_edge, !prof !1 ; CHECK: [[NEXT0]]: @@ -37,6 +42,40 @@ for.end: ; preds = %for.cond.for.end_cr ret void } +; We don't want to peel loops when optimizing for size. 
+; CHECK-LABEL: @optsize
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: br label %for.body
+; CHECK: for.body:
+; CHECK-NOT: br
+; CHECK: br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+define void @optsize(i32* %p, i32 %k) #1 !prof !0 {
+entry:
+  %cmp3 = icmp slt i32 0, %k
+  br i1 %cmp3, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+  %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %p.addr.04 = phi i32* [ %p, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %p.addr.04, i32 1
+  store i32 %i.05, i32* %p.addr.04, align 4
+  %inc = add nsw i32 %i.05, 1
+  %cmp = icmp slt i32 %inc, %k
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !prof !1
+
+for.cond.for.end_crit_edge: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind optsize }
+
 !0 = !{!"function_entry_count", i64 1}
 !1 = !{!"branch_weights", i32 3001, i32 1001}
diff --git a/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll b/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
new file mode 100644
index 000000000000..ad79e38cafa0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
@@ -0,0 +1,145 @@
+; REQUIRES: asserts
+; RUN: opt < %s -S -debug -loop-vectorize -mcpu=slm 2>&1 | FileCheck %s --check-prefix=SLM
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i8 @mul_i8(i8* %dataA, i8* %dataB, i32 %N) {
+entry:
+  %cmp12 = icmp eq i32 %N, 0
+  br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+  %phitmp = trunc i32 %add4 to i8
+  br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+  %acc.0.lcssa = phi i8 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
+  ret i8 %acc.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i8, i8* %dataA, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = sext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds i8, i8* %dataB, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %conv3 = sext i8 %1 to i32
+; sources of the mul are sext\sext from i8
+; use pmullw\sext seq.
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+  %mul = mul nsw i32 %conv3, %conv
+; sources of the mul are zext\sext from i8
+; use pmulhw\pmullw\pshuf
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+  %conv4 = zext i8 %1 to i32
+  %mul2 = mul nsw i32 %conv4, %conv
+  %sum0 = add i32 %mul, %mul2
+; sources of the mul are zext\zext from i8
+; use pmullw\zext
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+  %conv5 = zext i8 %0 to i32
+  %mul3 = mul nsw i32 %conv5, %conv4
+  %sum1 = add i32 %sum0, %mul3
+; sources of the mul are sext\-120
+; use pmullw\sext
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+  %mul4 = mul nsw i32 -120, %conv3
+  %sum2 = add i32 %sum1, %mul4
+; sources of the mul are sext\250
+; use pmulhw\pmullw\pshuf
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+  %mul5 = mul nsw i32 250, %conv3
+  %sum3 = add i32 %sum2, %mul5
+; sources of the mul are zext\-120
+; use pmulhw\pmullw\pshuf
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+  %mul6 = mul nsw i32 -120, %conv4
+  %sum4 = add i32 %sum3, %mul6
+; sources of the mul are zext\250
+; use pmullw\zext
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+  %mul7 = mul nsw i32 250, %conv4
+  %sum5 = add i32 %sum4, %mul7
+  %add = add i32 %acc.013, 5
+  %add4 = add i32 %add, %sum5
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define i16 @mul_i16(i16* %dataA, i16* %dataB, i32 %N) {
+entry:
+  %cmp12 = icmp eq i32 %N, 0
+  br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+  %phitmp = trunc i32 %add4 to i16
+  br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+  %acc.0.lcssa = phi i16 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
+  ret i16 %acc.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %dataA, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx, align 1
+  %conv = sext i16 %0 to i32
+  %arrayidx2 = getelementptr inbounds i16, i16* %dataB, i64 %indvars.iv
+  %1 = load i16, i16* %arrayidx2, align 1
+  %conv3 = sext i16 %1 to i32
+; sources of the mul are sext\sext from i16
+; use pmulhw\pmullw\pshuf seq.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+  %mul = mul nsw i32 %conv3, %conv
+; sources of the mul are zext\sext from i16
+; use pmulld
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
+  %conv4 = zext i16 %1 to i32
+  %mul2 = mul nsw i32 %conv4, %conv
+  %sum0 = add i32 %mul, %mul2
+; sources of the mul are zext\zext from i16
+; use pmulhw\pmullw\zext
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+  %conv5 = zext i16 %0 to i32
+  %mul3 = mul nsw i32 %conv5, %conv4
+  %sum1 = add i32 %sum0, %mul3
+; sources of the mul are sext\-32000
+; use pmulhw\pmullw\sext
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+  %mul4 = mul nsw i32 -32000, %conv3
+  %sum2 = add i32 %sum1, %mul4
+; sources of the mul are sext\64000
+; use pmulld
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
+  %mul5 = mul nsw i32 64000, %conv3
+  %sum3 = add i32 %sum2, %mul5
+; sources of the mul are zext\-32000
+; use pmulld
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
+  %mul6 = mul nsw i32 -32000, %conv4
+  %sum4 = add i32 %sum3, %mul6
+; sources of the mul are zext\64000
+; use pmulhw\pmullw\zext
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+  %mul7 = mul nsw i32 250, %conv4
+  %sum5 = add i32 %sum4, %mul7
+  %add = add i32 %acc.013, 5
+  %add4 = add i32 %add, %sum5
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+
diff --git a/test/Transforms/LoopVectorize/pr31190.ll b/test/Transforms/LoopVectorize/pr31190.ll
new file mode 100644
index 000000000000..afb1754983cd
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr31190.ll
@@ -0,0 +1,64 @@
+; RUN: opt -passes='loop-vectorize' -debug -S < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; This checks we don't crash when the inner loop we're trying to vectorize
+; is a SCEV AddRec with respect to an outer loop.
+
+; In this case, the problematic PHI is:
+; %0 = phi i32 [ undef, %for.cond1.preheader ], [ %inc54, %for.body3 ]
+; Since %inc54 is the IV of the outer loop, and %0 is equivalent to it,
+; we get the situation described above.
+
+; This test uses the new PM, because with the old PM, running loop-vectorize
+; would explicitly run loop-simplify. Even though this loop is already in
+; simplified form, loop-simplify would still clean up the phi.
+; The reason this matters is that in a real optimizer pipeline, LICM can create
+; such PHIs, and since it preserves loop simplified form, the cleanup has
+; no chance to run.
+
+; Code that leads to this situation can look something like:
+;
+; int a, b[1], c;
+; void fn1 ()
+; {
+;   for (; c; c++)
+;     for (a = 0; a; a++)
+;       b[c] = 4;
+; }
+;
+; The PHI is an artifact of the register promotion of c.
+
+@c = external global i32, align 4
+@a = external global i32, align 4
+@b = external global [1 x i32], align 4
+
+; CHECK: LV: PHI is a recurrence with respect to an outer loop.
+; CHECK: LV: Not vectorizing: Cannot prove legality.
+; CHECK-LABEL: @test +define void @test() { +entry: + %a.promoted2 = load i32, i32* @a, align 1 + %c.promoted = load i32, i32* @c, align 1 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.for.inc4_crit_edge, %entry + %inc54 = phi i32 [ %inc5, %for.cond1.for.inc4_crit_edge ], [ %c.promoted, %entry ] + %inc.lcssa3 = phi i32 [ %inc.lcssa, %for.cond1.for.inc4_crit_edge ], [ %a.promoted2, %entry ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %inc1 = phi i32 [ %inc.lcssa3, %for.cond1.preheader ], [ %inc, %for.body3 ] + %0 = phi i32 [ undef, %for.cond1.preheader ], [ %inc54, %for.body3 ] + %idxprom = sext i32 %0 to i64 + %arrayidx = getelementptr inbounds [1 x i32], [1 x i32]* @b, i64 0, i64 %idxprom + store i32 4, i32* %arrayidx, align 4 + %inc = add nsw i32 %inc1, 1 + %tobool2 = icmp eq i32 %inc, 0 + br i1 %tobool2, label %for.cond1.for.inc4_crit_edge, label %for.body3 + +for.cond1.for.inc4_crit_edge: ; preds = %for.body3 + %inc.lcssa = phi i32 [ %inc, %for.body3 ] + %.lcssa = phi i32 [ %inc54, %for.body3 ] + %inc5 = add nsw i32 %.lcssa, 1 + br label %for.cond1.preheader +} diff --git a/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml b/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml index d2a3ef81a3a4..b7a1d208fc6f 100644 --- a/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml +++ b/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml @@ -6,5 +6,5 @@ TypeIdMap: typeid1: TTRes: Kind: Unsat - SizeBitWidth: 0 + SizeM1BitWidth: 0 ... diff --git a/test/Transforms/LowerTypeTests/function.ll b/test/Transforms/LowerTypeTests/function.ll index 9abea8f854c1..759041fea6f1 100644 --- a/test/Transforms/LowerTypeTests/function.ll +++ b/test/Transforms/LowerTypeTests/function.ll @@ -43,7 +43,7 @@ declare i1 @llvm.type.test(i8* %ptr, metadata %bitset) nounwind readnone define i1 @foo(i8* %p) { ; NATIVE: sub i64 {{.*}}, ptrtoint (void ()* @[[JT]] to i64) ; WASM32: sub i64 {{.*}}, ptrtoint (i8* getelementptr (i8, i8* null, i64 1) to i64) - ; WASM32: icmp ult i64 {{.*}}, 2 + ; WASM32: icmp ule i64 {{.*}}, 1 %x = call i1 @llvm.type.test(i8* %p, metadata !"typeid1") ret i1 %x } diff --git a/test/Transforms/LowerTypeTests/import-unsat.ll b/test/Transforms/LowerTypeTests/import-unsat.ll index 7ca70f2636fd..7410bc4b4d88 100644 --- a/test/Transforms/LowerTypeTests/import-unsat.ll +++ b/test/Transforms/LowerTypeTests/import-unsat.ll @@ -10,7 +10,7 @@ ; SUMMARY-NEXT: typeid1: ; SUMMARY-NEXT: TTRes: ; SUMMARY-NEXT: Kind: Unsat -; SUMMARY-NEXT: SizeBitWidth: 0 +; SUMMARY-NEXT: SizeM1BitWidth: 0 target datalayout = "e-p:32:32" diff --git a/test/Transforms/LowerTypeTests/simple.ll b/test/Transforms/LowerTypeTests/simple.ll index 91b94184420b..cedfcb4a63a0 100644 --- a/test/Transforms/LowerTypeTests/simple.ll +++ b/test/Transforms/LowerTypeTests/simple.ll @@ -69,7 +69,7 @@ define i1 @foo(i32* %p) { ; CHECK: [[R3:%[^ ]*]] = lshr i32 [[R2]], 2 ; CHECK: [[R4:%[^ ]*]] = shl i32 [[R2]], 30 ; CHECK: [[R5:%[^ ]*]] = or i32 [[R3]], [[R4]] - ; CHECK: [[R6:%[^ ]*]] = icmp ult i32 [[R5]], 68 + ; CHECK: [[R6:%[^ ]*]] = icmp ule i32 [[R5]], 67 ; CHECK: br i1 [[R6]] ; CHECK: [[R8:%[^ ]*]] = getelementptr i8, i8* @bits_use.{{[0-9]*}}, i32 [[R5]] @@ -96,7 +96,7 @@ define i1 @bar(i32* %p) { ; CHECK: [[S3:%[^ ]*]] = lshr i32 [[S2]], 8 ; CHECK: [[S4:%[^ ]*]] = shl i32 [[S2]], 24 ; CHECK: [[S5:%[^ ]*]] = or i32 [[S3]], [[S4]] - ; CHECK: [[S6:%[^ ]*]] = icmp ult i32 [[S5]], 2 + ; CHECK: [[S6:%[^ ]*]] = icmp ule i32 [[S5]], 1 %x = call i1 @llvm.type.test(i8* 
%pi8, metadata !"typeid2") ; CHECK: ret i1 [[S6]] @@ -112,7 +112,7 @@ define i1 @baz(i32* %p) { ; CHECK: [[T3:%[^ ]*]] = lshr i32 [[T2]], 2 ; CHECK: [[T4:%[^ ]*]] = shl i32 [[T2]], 30 ; CHECK: [[T5:%[^ ]*]] = or i32 [[T3]], [[T4]] - ; CHECK: [[T6:%[^ ]*]] = icmp ult i32 [[T5]], 66 + ; CHECK: [[T6:%[^ ]*]] = icmp ule i32 [[T5]], 65 ; CHECK: br i1 [[T6]] ; CHECK: [[T8:%[^ ]*]] = getelementptr i8, i8* @bits_use{{(\.[0-9]*)?}}, i32 [[T5]] diff --git a/test/Transforms/NewGVN/assume-equal.ll b/test/Transforms/NewGVN/assume-equal.ll index b6c2a7afb294..7e009192064a 100644 --- a/test/Transforms/NewGVN/assume-equal.ll +++ b/test/Transforms/NewGVN/assume-equal.ll @@ -66,22 +66,20 @@ if.then: ; preds = %entry %vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)** %call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8 -; FIXME: those loads could be also direct, but right now the invariant.group -; analysis works only on single block -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %callx = tail call i32 %call1(%struct.A* %0) #1 %vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)** %call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %cally = tail call i32 %call4(%struct.A* %0) #1 %b = bitcast i8* %call to %struct.A** %vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0 %vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)** %vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %unknown = tail call i32 %vfun(%struct.A* %0) #1 br label %if.end diff --git a/test/Transforms/NewGVN/invariant.group.ll b/test/Transforms/NewGVN/invariant.group.ll index 80c6e05a8e24..c421df6bd3b1 100644 --- a/test/Transforms/NewGVN/invariant.group.ll +++ b/test/Transforms/NewGVN/invariant.group.ll @@ -393,6 +393,45 @@ define void @testNotGlobal() { ret void } +; CHECK-LABEL: define void @handling_loops() +define void @handling_loops() { + %a = alloca %struct.A, align 8 + %1 = bitcast %struct.A* %a to i8* + %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0 + %3 = load i8, i8* @unknownPtr, align 4 + %4 = icmp sgt i8 %3, 0 + br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit + +.lr.ph.i: ; preds = %0 + %5 = bitcast %struct.A* %a to void (%struct.A*)*** + %6 = load i8, i8* @unknownPtr, align 4 + %7 = icmp sgt i8 %6, 1 + br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit + +._crit_edge.preheader: ; preds = %.lr.ph.i + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.preheader, %._crit_edge + %8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ] + %.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0 + %9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8 +; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a) + call void %9(%struct.A* nonnull %a) #3 + +; CHECK-NOT: call void % + %10 = add nuw nsw i8 %8, 1 + %11 = load i8, i8* @unknownPtr, align 4 + %12 = icmp slt i8 %10, %11 + br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit + +_Z2g2R1A.exit.loopexit: ; preds = %._crit_edge + br label 
%_Z2g2R1A.exit
+
+_Z2g2R1A.exit: ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0
+  ret void
+}
+
 declare void @foo(i8*)
 declare void @foo2(i8*, i8)
diff --git a/test/Transforms/NewGVN/pr31594.ll b/test/Transforms/NewGVN/pr31594.ll
new file mode 100644
index 000000000000..0cdac1a7fff4
--- /dev/null
+++ b/test/Transforms/NewGVN/pr31594.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @patatino(i8* %blah, i32 %choice) {
+; CHECK-LABEL: @patatino(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[FOO:%.*]] = phi i8* [ [[BLAH:%.*]], [[ENTRY:%.*]] ], [ null, [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: switch i32 [[CHOICE:%.*]], label [[WHILE_BODY]] [
+; CHECK-NEXT: i32 -1, label [[WHILE_END:%.*]]
+; CHECK-NEXT: i32 40, label [[LAND_END:%.*]]
+; CHECK-NEXT: ]
+; CHECK: land.end:
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.body:
+; CHECK-NEXT: br label [[WHILE_COND]]
+; CHECK: while.end:
+; CHECK-NEXT: store i8 0, i8* [[FOO]], align 1
+; CHECK-NEXT: store i8 0, i8* [[BLAH]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %foo = phi i8* [ %blah, %entry ], [ null, %while.body ]
+  switch i32 %choice, label %while.body [
+  i32 -1, label %while.end
+  i32 40, label %land.end
+  ]
+
+land.end:
+  br label %while.end
+
+while.body:
+  br label %while.cond
+
+while.end:
+  %foo.lcssa = phi i8* [ %foo, %land.end ], [ %foo, %while.cond ]
+;; These two stores will initially be considered equivalent, but then proven not to be.
+;; The second store would previously end up deciding it's equivalent to a previous
+;; store, but it was really just finding an optimistic version of itself
+;; in the congruence class.
+  store i8 0, i8* %foo.lcssa, align 1
+  %0 = load i8, i8* %blah, align 1
+  %loaded = icmp eq i8 %0, 0
+  store i8 0, i8* %blah, align 1
+  ret void
+}
+
+
+;; This is an example of a case where the memory states are equivalent solely due to unreachability,
+;; but the stores are not equal.
+define void @foo(i8* %arg) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP:%.*]] = phi i8* [ [[ARG:%.*]], [[BB:%.*]] ], [ null, [[BB2:%.*]] ] +; CHECK-NEXT: br i1 undef, label [[BB3:%.*]], label [[BB2]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB1]] +; CHECK: bb3: +; CHECK-NEXT: store i8 0, i8* [[TMP]], !g !0 +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb4: +; CHECK-NEXT: br label [[BB6:%.*]] +; CHECK: bb6: +; CHECK-NEXT: br i1 undef, label [[BB9:%.*]], label [[BB7:%.*]] +; CHECK: bb7: +; CHECK-NEXT: switch i8 0, label [[BB6]] [ +; CHECK-NEXT: i8 6, label [[BB8:%.*]] +; CHECK-NEXT: ] +; CHECK: bb8: +; CHECK-NEXT: br label [[BB4]] +; CHECK: bb9: +; CHECK-NEXT: store i8 0, i8* [[ARG]], !g !0 +; CHECK-NEXT: unreachable +; +bb: + br label %bb1 + +bb1: ; preds = %bb2, %bb + %tmp = phi i8* [ %arg, %bb ], [ null, %bb2 ] + br i1 undef, label %bb3, label %bb2 + +bb2: ; preds = %bb1 + br label %bb1 + +bb3: ; preds = %bb1 + store i8 0, i8* %tmp, !g !0 + br label %bb4 + +bb4: ; preds = %bb8, %bb3 + %tmp5 = phi i8* [ null, %bb8 ], [ %arg, %bb3 ] + br label %bb6 + +bb6: ; preds = %bb7, %bb4 + br i1 undef, label %bb9, label %bb7 + +bb7: ; preds = %bb6 + switch i8 0, label %bb6 [ + i8 6, label %bb8 + ] + +bb8: ; preds = %bb7 + store i8 undef, i8* %tmp5, !g !0 + br label %bb4 + +bb9: ; preds = %bb6 + %tmp10 = phi i8* [ %tmp5, %bb6 ] + store i8 0, i8* %tmp10, !g !0 + unreachable +} + +!0 = !{} diff --git a/test/Transforms/PGOProfile/Inputs/multiple_hash_profile.proftext b/test/Transforms/PGOProfile/Inputs/multiple_hash_profile.proftext new file mode 100644 index 000000000000..5bf67fb2bfaf --- /dev/null +++ b/test/Transforms/PGOProfile/Inputs/multiple_hash_profile.proftext @@ -0,0 +1,36 @@ +# IR level Instrumentation Flag +:ir +_Z3fooi +# Func Hash: +72057606922829823 +# Num Counters: +2 +# Counter Values: +18 +12 + +_Z3fooi +# Func Hash: +12884901887 +# Num Counters: +1 +# Counter Values: +0 + +_Z3bari +# Func Hash: +72057606922829823 +# Num Counters: +2 +# Counter Values: +0 +0 + +_Z4m2f1v +# Func Hash: +12884901887 +# Num Counters: +1 +# Counter Values: +1 + diff --git a/test/Transforms/PGOProfile/comdat_internal.ll b/test/Transforms/PGOProfile/comdat_internal.ll index 25dafbea1035..7df6f91fe729 100644 --- a/test/Transforms/PGOProfile/comdat_internal.ll +++ b/test/Transforms/PGOProfile/comdat_internal.ll @@ -4,17 +4,17 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" $foo = comdat any -; CHECK: $foo.[[FOO_HASH:[0-9]+]] = comdat any +; CHECK: $foo = comdat any ; CHECK: $__llvm_profile_raw_version = comdat any -; CHECK: $__profv__stdin__foo.[[FOO_HASH]] = comdat any +; CHECK: $__profv__stdin__foo.[[FOO_HASH:[0-9]+]] = comdat any @bar = global i32 ()* @foo, align 8 ; CHECK: @__llvm_profile_raw_version = constant i64 {{[0-9]+}}, comdat -; CHECK: @__profn__stdin__foo.[[FOO_HASH]] = private constant [23 x i8] c"<stdin>:foo.[[FOO_HASH]]" +; CHECK: @__profn__stdin__foo = private constant [11 x i8] c"<stdin>:foo" ; CHECK: @__profc__stdin__foo.[[FOO_HASH]] = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat($__profv__stdin__foo.[[FOO_HASH]]), align 8 -; CHECK: @__profd__stdin__foo.[[FOO_HASH]] = private global { i64, i64, i64*, i8*, i8*, i32, [1 x i16] } { i64 6965568665848889497, i64 [[FOO_HASH]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc__stdin__foo.[[FOO_HASH]], i32 0, i32 0), i8* null +; CHECK: @__profd__stdin__foo.[[FOO_HASH]] = private 
global { i64, i64, i64*, i8*, i8*, i32, [1 x i16] } { i64 -5640069336071256030, i64 [[FOO_HASH]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc__stdin__foo.[[FOO_HASH]], i32 0, i32 0), i8* null ; CHECK-NOT: bitcast (i32 ()* @foo to i8*) ; CHECK-SAME: , i8* null, i32 1, [1 x i16] zeroinitializer }, section "__llvm_prf_data", comdat($__profv__stdin__foo.[[FOO_HASH]]), align 8 ; CHECK: @__llvm_prf_nm diff --git a/test/Transforms/PGOProfile/comdat_rename.ll b/test/Transforms/PGOProfile/comdat_rename.ll index b69c802093b4..eb9ddb4a1cea 100644 --- a/test/Transforms/PGOProfile/comdat_rename.ll +++ b/test/Transforms/PGOProfile/comdat_rename.ll @@ -1,7 +1,7 @@ -; RUN: opt < %s -mtriple=x86_64-unknown-linux -pgo-instr-gen -S | FileCheck --check-prefixes COMMON,ELFONLY %s -; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=pgo-instr-gen -S | FileCheck --check-prefixes COMMON,ELFONLY %s -; RUN: opt < %s -mtriple=x86_64-pc-win32-coff -pgo-instr-gen -S | FileCheck --check-prefixes COMMON,COFFONLY %s -; RUN: opt < %s -mtriple=x86_64-pc-win32-coff -passes=pgo-instr-gen -S | FileCheck --check-prefixes COMMON,COFFONLY %s +; RUN: opt < %s -mtriple=x86_64-unknown-linux -pgo-instr-gen -do-comdat-renaming=true -S | FileCheck --check-prefixes COMMON,ELFONLY %s +; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=pgo-instr-gen -do-comdat-renaming=true -S | FileCheck --check-prefixes COMMON,ELFONLY %s +; RUN: opt < %s -mtriple=x86_64-pc-win32-coff -pgo-instr-gen -do-comdat-renaming=true -S | FileCheck --check-prefixes COMMON,COFFONLY %s +; RUN: opt < %s -mtriple=x86_64-pc-win32-coff -passes=pgo-instr-gen -do-comdat-renaming=true -S | FileCheck --check-prefixes COMMON,COFFONLY %s ; Rename Comdat group and its function. $f = comdat any @@ -38,22 +38,10 @@ define linkonce void @tf2() comdat($tf) { ret void } -; Renaming Comdat with aliases. -$f_with_alias = comdat any -; COMMON: $f_with_alias.[[SINGLEBB_HASH]] = comdat any -@af = alias void (...), bitcast (void ()* @f_with_alias to void (...)*) -; COFFONLY: @af.[[SINGLEBB_HASH]] = alias void (...), bitcast (void ()* @f_with_alias.[[SINGLEBB_HASH]] to -; ELFONLY-DAG: @af.[[SINGLEBB_HASH]] = alias void (...), bitcast (void ()* @f_with_alias.[[SINGLEBB_HASH]] to -define linkonce_odr void @f_with_alias() comdat($f_with_alias) { - ret void -} - ; Rename AvailableExternallyLinkage functions ; ELFONLY-DAG: $aef.[[SINGLEBB_HASH]] = comdat any ; ELFONLY: @f = weak alias void (), void ()* @f.[[SINGLEBB_HASH]] -; ELFONLY: @f_with_alias = weak alias void (), void ()* @f_with_alias.[[SINGLEBB_HASH]] -; ELFONLY: @af = weak alias void (...), void (...)* @af.[[SINGLEBB_HASH]] ; ELFONLY: @aef = weak alias void (), void ()* @aef.[[SINGLEBB_HASH]] define available_externally void @aef() { diff --git a/test/Transforms/PGOProfile/indirect_call_profile.ll b/test/Transforms/PGOProfile/indirect_call_profile.ll index 409c29ef8728..e1f499c08a7b 100644 --- a/test/Transforms/PGOProfile/indirect_call_profile.ll +++ b/test/Transforms/PGOProfile/indirect_call_profile.ll @@ -54,7 +54,7 @@ bb11: ; preds = %bb2 } ; Test that comdat function's address is recorded. 
-; LOWER: @__profd_foo3.[[FOO3_HASH:[0-9]+]] = linkonce_odr{{.*}}@foo3.[[FOO3_HASH]]
+; LOWER: @__profd_foo3.[[FOO3_HASH:[0-9]+]] = linkonce_odr{{.*}}@__profc_foo3.[[FOO3_HASH]]
 ; Function Attrs: nounwind uwtable
 define linkonce_odr i32 @foo3() comdat {
   ret i32 1
diff --git a/test/Transforms/PGOProfile/multiple_hash_profile.ll b/test/Transforms/PGOProfile/multiple_hash_profile.ll
new file mode 100644
index 000000000000..f4041830f8f8
--- /dev/null
+++ b/test/Transforms/PGOProfile/multiple_hash_profile.ll
@@ -0,0 +1,36 @@
+; RUN: llvm-profdata merge %S/Inputs/multiple_hash_profile.proftext -o %t.profdata
+; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$_Z3fooi = comdat any
+
+@g2 = local_unnamed_addr global i32 (i32)* null, align 8
+
+define i32 @_Z3bari(i32 %i) {
+entry:
+  %cmp = icmp sgt i32 %i, 2
+  %mul = select i1 %cmp, i32 1, i32 %i
+  %retval.0 = mul nsw i32 %mul, %i
+  ret i32 %retval.0
+}
+
+define void @_Z4m2f1v() {
+entry:
+  store i32 (i32)* @_Z3fooi, i32 (i32)** @g2, align 8
+  ret void
+}
+
+define linkonce_odr i32 @_Z3fooi(i32 %i) comdat {
+entry:
+  %cmp.i = icmp sgt i32 %i, 2
+  %mul.i = select i1 %cmp.i, i32 1, i32 %i
+; CHECK: %mul.i = select i1 %cmp.i, i32 1, i32 %i
+; CHECK-SAME: !prof ![[BW:[0-9]+]]
+; CHECK: ![[BW]] = !{!"branch_weights", i32 12, i32 6}
+  %retval.0.i = mul nsw i32 %mul.i, %i
+  ret i32 %retval.0.i
+}
+
+
diff --git a/test/Transforms/SLPVectorizer/X86/pr31599.ll b/test/Transforms/SLPVectorizer/X86/pr31599.ll
new file mode 100644
index 000000000000..64e0f7be7e2e
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/pr31599.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define <2 x float> @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SOURCE:%.*]] = insertelement <2 x float> undef, float undef, i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = fsub <2 x float> [[SOURCE]], [[SOURCE]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
+; CHECK-NEXT: [[RES1:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
+; CHECK-NEXT: [[RES2:%.*]] = insertelement <2 x float> [[RES1]], float [[TMP2]], i32 1
+; CHECK-NEXT: ret <2 x float> [[RES2]]
+;
+entry:
+  %source = insertelement <2 x float> undef, float undef, i32 0
+  %e0 = extractelement <2 x float> %source, i32 0
+  %e0.dup = extractelement <2 x float> %source, i32 0
+  %sub1 = fsub float %e0, %e0.dup
+  %e1 = extractelement <2 x float> %source, i32 1
+  %e1.dup = extractelement <2 x float> %source, i32 1
+  %sub2 = fsub float %e1, %e1.dup
+  %res1 = insertelement <2 x float> undef, float %sub1, i32 0
+  %res2 = insertelement <2 x float> %res1, float %sub2, i32 1
+  ret <2 x float> %res2
+}
+
+!llvm.ident = !{!0, !0}
+
+!0 = !{!"clang version 4.0.0 "}
diff --git a/test/Transforms/StructurizeCFG/no-branch-to-entry.ll b/test/Transforms/StructurizeCFG/no-branch-to-entry.ll
index 2e22c8715347..1db1060ca821 100644
--- a/test/Transforms/StructurizeCFG/no-branch-to-entry.ll
+++ b/test/Transforms/StructurizeCFG/no-branch-to-entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -o - -structurizecfg < %s | FileCheck %s
+; RUN: opt -S -o - -structurizecfg -verify-dom-info < %s | FileCheck %s
 
 ; CHECK-LABEL: @no_branch_to_entry_undef(
 ; CHECK: entry:
diff --git a/test/tools/llvm-config/booleans.test b/test/tools/llvm-config/booleans.test
new file mode 100644
index 000000000000..b28f293666f3
--- /dev/null
+++ b/test/tools/llvm-config/booleans.test
@@ -0,0 +1,28 @@
+# Check whether boolean options are consistently normalized to ON/OFF.
+RUN: llvm-config --assertion-mode 2>&1 | FileCheck --check-prefix=CHECK-ONOFF %s
+RUN: llvm-config --has-global-isel 2>&1 | FileCheck --check-prefix=CHECK-ONOFF %s
+CHECK-ONOFF: {{ON|OFF}}
+CHECK-ONOFF-NOT: error:
+CHECK-ONOFF-NOT: warning
+
+# ...or to YES/NO.
+RUN: llvm-config --has-rtti 2>&1 | FileCheck --check-prefix=CHECK-YESNO %s
+CHECK-YESNO: {{YES|NO}}
+CHECK-YESNO-NOT: error:
+CHECK-YESNO-NOT: warning
+
+# Also check some other multi-choice options.
+RUN: llvm-config --build-mode 2>&1 | FileCheck --check-prefix=CHECK-BUILD-MODE %s
+CHECK-BUILD-MODE: {{[Dd][Ee][Bb][Uu][Gg]|[Rr][Ee][Ll][Ee][Aa][Ss][Ee]|[Rr][Ee][Ll][Ww][Ii][Tt][Hh][Dd][Ee][Bb][Ii][Nn][Ff][Oo]|[Mm][Ii][Nn][Ss][Ii][Zz][Ee][Rr][Ee][Ll]}}
+CHECK-BUILD-MODE-NOT: error:
+CHECK-BUILD-MODE-NOT: warning
+
+RUN: llvm-config --build-system 2>&1 | FileCheck --check-prefix=CHECK-BUILD-SYSTEM %s
+CHECK-BUILD-SYSTEM: cmake
+CHECK-BUILD-SYSTEM-NOT: error:
+CHECK-BUILD-SYSTEM-NOT: warning
+
+RUN: llvm-config --shared-mode 2>&1 | FileCheck --check-prefix=CHECK-SHARED-MODE %s
+CHECK-SHARED-MODE: {{static|shared}}
+CHECK-SHARED-MODE-NOT: error:
+CHECK-SHARED-MODE-NOT: warning
diff --git a/test/tools/llvm-xray/X86/Inputs/elf64-objcopied-instrmap.bin b/test/tools/llvm-xray/X86/Inputs/elf64-objcopied-instrmap.bin
Binary files differ
new file mode 100755
index 000000000000..4e1f982e2673
--- /dev/null
+++ b/test/tools/llvm-xray/X86/Inputs/elf64-objcopied-instrmap.bin
diff --git a/test/tools/llvm-xray/X86/Inputs/elf64-sample-o2.bin b/test/tools/llvm-xray/X86/Inputs/elf64-sample-o2.bin
Binary files differ
new file mode 100755
index 000000000000..fe31f9514d37
--- /dev/null
+++ b/test/tools/llvm-xray/X86/Inputs/elf64-sample-o2.bin
diff --git a/test/tools/llvm-xray/X86/Inputs/naive-log-simple.xray b/test/tools/llvm-xray/X86/Inputs/naive-log-simple.xray
Binary files differ
new file mode 100644
index 000000000000..284bc6b63e95
--- /dev/null
+++ b/test/tools/llvm-xray/X86/Inputs/naive-log-simple.xray
diff --git a/test/tools/llvm-xray/X86/Inputs/simple-instrmap.yaml b/test/tools/llvm-xray/X86/Inputs/simple-instrmap.yaml
new file mode 100644
index 000000000000..e9c9f2e8d3c8
--- /dev/null
+++ b/test/tools/llvm-xray/X86/Inputs/simple-instrmap.yaml
@@ -0,0 +1,10 @@
+# This is a simple instrumentation map with bogus addresses and offsets, but it
+# follows the recommended format.
+---
+- { id: 1, address: 0x1, function: 0x1, kind: function-enter, always-instrument: true}
+- { id: 1, address: 0x2, function: 0x1, kind: function-exit, always-instrument: true}
+- { id: 2, address: 0x2, function: 0x2, kind: function-enter, always-instrument: true}
+- { id: 2, address: 0x3, function: 0x2, kind: function-exit, always-instrument: true}
+- { id: 3, address: 0x3, function: 0x3, kind: function-enter, always-instrument: true}
+- { id: 3, address: 0x4, function: 0x3, kind: function-exit, always-instrument: true}
+...
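The account-deduce-tail-call.yaml trace below describes a run in which f2() never gets its own exit record because it tail-calls f3(); in C the call structure is roughly this sketch (names taken from the trace comments, bodies hypothetical):

int f3(int x) { return x + 1; }
int f2(int x) { return f3(x); } /* tail call: f2's exit folds into f3's */
int f1(int x) { return f2(x); }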
diff --git a/test/tools/llvm-xray/X86/Inputs/simple-xray-instrmap.yaml b/test/tools/llvm-xray/X86/Inputs/simple-xray-instrmap.yaml
new file mode 100644
index 000000000000..483d3e4f2c8f
--- /dev/null
+++ b/test/tools/llvm-xray/X86/Inputs/simple-xray-instrmap.yaml
@@ -0,0 +1,14 @@
+---
+- { id: 1, address: 0x000000000041CA40, function: 0x000000000041CA40, kind: function-enter,
+    always-instrument: true }
+- { id: 1, address: 0x000000000041CA50, function: 0x000000000041CA40, kind: tail-exit,
+    always-instrument: true }
+- { id: 2, address: 0x000000000041CA70, function: 0x000000000041CA70, kind: function-enter,
+    always-instrument: true }
+- { id: 2, address: 0x000000000041CA7C, function: 0x000000000041CA70, kind: tail-exit,
+    always-instrument: true }
+- { id: 3, address: 0x000000000041CAA0, function: 0x000000000041CAA0, kind: function-enter,
+    always-instrument: true }
+- { id: 3, address: 0x000000000041CAB4, function: 0x000000000041CAA0, kind: function-exit,
+    always-instrument: true }
+...
diff --git a/test/tools/llvm-xray/X86/account-deduce-tail-call.yaml b/test/tools/llvm-xray/X86/account-deduce-tail-call.yaml
new file mode 100644
index 000000000000..6e926974141f
--- /dev/null
+++ b/test/tools/llvm-xray/X86/account-deduce-tail-call.yaml
@@ -0,0 +1,36 @@
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -d | FileCheck %s
+---
+header:
+  version: 1
+  type: 0
+  constant-tsc: true
+  nonstop-tsc: true
+  cycle-frequency: 0
+records:
+# Here we reconstruct the following call trace:
+#
+#   f1()
+#     f2()
+#       f3()
+#
+# But we find that we're missing an exit record for f2() because it
+# tail-called f3(). We make sure that if we see a trace like this we can
+# deduce tail calls, and account the time (potentially wrongly) to f2() when
+# f1() exits. That is because we don't go back to f3()'s entry record to
+# properly do the math on the timing of f2().
+#
+# Note that by default, tail/sibling call deduction is disabled, and is enabled
+# with a flag "-d" or "-deduce-sibling-calls".
+#
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10000 }
+  - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
+  - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 10002 }
+  - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 10003 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10004 }
+...
+
+#CHECK: Functions with latencies: 3
+#CHECK-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#CHECK-NEXT: 1 1 [ 4.{{.*}}, 4.{{.*}}, 4.{{.*}}, 4.{{.*}}, 4.{{.*}}] {{.*}} {{.*}}
+#CHECK-NEXT: 2 1 [ 3.{{.*}}, 3.{{.*}}, 3.{{.*}}, 3.{{.*}}, 3.{{.*}}] {{.*}} {{.*}}
+#CHECK-NEXT: 3 1 [ 1.{{.*}}, 1.{{.*}}, 1.{{.*}}, 1.{{.*}}, 1.{{.*}}] {{.*}} {{.*}}
diff --git a/test/tools/llvm-xray/X86/account-keep-going.yaml b/test/tools/llvm-xray/X86/account-keep-going.yaml
new file mode 100644
index 000000000000..1b234c0d7e8e
--- /dev/null
+++ b/test/tools/llvm-xray/X86/account-keep-going.yaml
@@ -0,0 +1,20 @@
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -k | FileCheck %s
+---
+header:
+  version: 1
+  type: 0
+  constant-tsc: true
+  nonstop-tsc: true
+  cycle-frequency: 0
+records:
+# We want to test the case where we see spurious exits but keep going
+# anyway, ignoring those records in the process.
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10000 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 10002 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 10003 } + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10004 } +... +#CHECK: Functions with latencies: 1 +#CHECK-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +#CHECK-NEXT: 3 1 [ 1.{{.*}}, 1.{{.*}}, 1.{{.*}}, 1.{{.*}}, 1.{{.*}}] {{.*}} {{.*}} diff --git a/test/tools/llvm-xray/X86/account-simple-case.yaml b/test/tools/llvm-xray/X86/account-simple-case.yaml new file mode 100644 index 000000000000..82d83aae033e --- /dev/null +++ b/test/tools/llvm-xray/X86/account-simple-case.yaml @@ -0,0 +1,18 @@ +#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml | FileCheck %s +--- +header: + version: 1 + type: 0 + constant-tsc: true + nonstop-tsc: true + cycle-frequency: 2601000000 +records: + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, + tsc: 10001 } + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, + tsc: 10100 } +... + +#CHECK: Functions with latencies: 1 +#CHECK-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +#CHECK-NEXT: 1 1 [ {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} diff --git a/test/tools/llvm-xray/X86/account-simple-sorting.yaml b/test/tools/llvm-xray/X86/account-simple-sorting.yaml new file mode 100644 index 000000000000..d25aef24a272 --- /dev/null +++ b/test/tools/llvm-xray/X86/account-simple-sorting.yaml @@ -0,0 +1,85 @@ +#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml | FileCheck --check-prefix DEFAULT %s +#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s count | FileCheck --check-prefix COUNT-ASC %s +#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s min | FileCheck --check-prefix MIN-ASC %s +#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s max | FileCheck --check-prefix MAX-ASC %s +#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s sum | FileCheck --check-prefix SUM-ASC %s + +#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s count -r dsc | FileCheck --check-prefix COUNT-DSC %s +#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s min -r dsc | FileCheck --check-prefix MIN-DSC %s +#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s max -r dsc | FileCheck --check-prefix MAX-DSC %s +#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s sum -r dsc | FileCheck --check-prefix SUM-DSC %s +--- +header: + version: 1 + type: 0 + constant-tsc: true + nonstop-tsc: true + cycle-frequency: 1 +records: + # Function id: 1 + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, + tsc: 10001 } + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, + tsc: 10100 } + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, + tsc: 10101 } + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, + tsc: 10200 } + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, + tsc: 10201 } + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, + tsc: 10300 } + # Function id: 2 + - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-enter, + tsc: 10001 } + - { type: 0, 
func-id: 2, cpu: 1, thread: 222, kind: function-exit, + tsc: 10002 } + - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-enter, + tsc: 10101 } + - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-exit, + tsc: 10102 } + +#DEFAULT: Functions with latencies: 2 +#DEFAULT-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +#DEFAULT-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} +#DEFAULT-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} + +#COUNT-ASC: Functions with latencies: 2 +#COUNT-ASC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +#COUNT-ASC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} +#COUNT-ASC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} + +#COUNT-DSC: Functions with latencies: 2 +#COUNT-DSC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +#COUNT-DSC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} +#COUNT-DSC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} + +#MIN-ASC: Functions with latencies: 2 +#MIN-ASC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +#MIN-ASC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} +#MIN-ASC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} + +#MIN-DSC: Functions with latencies: 2 +#MIN-DSC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +#MIN-DSC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} +#MIN-DSC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} + +#MAX-ASC: Functions with latencies: 2 +#MAX-ASC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +#MAX-ASC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} +#MAX-ASC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} + +#MAX-DSC: Functions with latencies: 2 +#MAX-DSC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +#MAX-DSC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} +#MAX-DSC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} + +#SUM-ASC: Functions with latencies: 2 +#SUM-ASC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +#SUM-ASC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} +#SUM-ASC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} + +#SUM-DSC: Functions with latencies: 2 +#SUM-DSC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +#SUM-DSC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} +#SUM-DSC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}} diff --git a/test/tools/llvm-xray/X86/bad-instrmap-sizes.bin b/test/tools/llvm-xray/X86/bad-instrmap-sizes.txt index 4ea33510e5dc..4ea33510e5dc 100644 --- a/test/tools/llvm-xray/X86/bad-instrmap-sizes.bin +++ b/test/tools/llvm-xray/X86/bad-instrmap-sizes.txt diff --git a/test/tools/llvm-xray/X86/convert-roundtrip.yaml b/test/tools/llvm-xray/X86/convert-roundtrip.yaml new file mode 100644 index 000000000000..844426284264 --- /dev/null +++ b/test/tools/llvm-xray/X86/convert-roundtrip.yaml @@ -0,0 +1,28 @@ +#RUN: llvm-xray convert %s -f=raw -o %t && llvm-xray convert %t -f=yaml -o - | FileCheck %s +--- +header: + version: 1 + type: 0 + constant-tsc: true + nonstop-tsc: true + cycle-frequency: 2601000000 +records: + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, + tsc: 10001 } + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, + tsc: 10100 } +... 
+ +#CHECK: --- +#CHECK-NEXT: header: +#CHECK-NEXT: version: 1 +#CHECK-NEXT: type: 0 +#CHECK-NEXT: constant-tsc: true +#CHECK-NEXT: nonstop-tsc: true +#CHECK-NEXT: cycle-frequency: 2601000000 +#CHECK-NEXT: records: +#CHECK-NEXT: - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-enter, +#CHECK-NEXT: tsc: 10001 } +#CHECK-NEXT: - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-exit, +#CHECK-NEXT: tsc: 10100 } +#CHECK-NEXT: ... diff --git a/test/tools/llvm-xray/X86/convert-to-yaml.txt b/test/tools/llvm-xray/X86/convert-to-yaml.txt new file mode 100644 index 000000000000..c402bc18d83d --- /dev/null +++ b/test/tools/llvm-xray/X86/convert-to-yaml.txt @@ -0,0 +1,23 @@ +; RUN: llvm-xray convert %S/Inputs/naive-log-simple.xray -f=yaml -o - | FileCheck %s + +; CHECK: --- +; CHECK-NEXT: header: +; CHECK-NEXT: version: 1 +; CHECK-NEXT: type: 0 +; CHECK-NEXT: constant-tsc: true +; CHECK-NEXT: nonstop-tsc: true +; CHECK-NEXT: cycle-frequency: 2601000000 +; CHECK-NEXT: records: +; CHECK-NEXT: - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter, +; CHECK-NEXT: tsc: 3315356841453914 } +; CHECK-NEXT: - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter, +; CHECK-NEXT: tsc: 3315356841454542 } +; CHECK-NEXT: - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit, +; CHECK-NEXT: tsc: 3315356841454670 } +; CHECK-NEXT: - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter, +; CHECK-NEXT: tsc: 3315356841454762 } +; CHECK-NEXT: - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit, +; CHECK-NEXT: tsc: 3315356841454802 } +; CHECK-NEXT: - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit, +; CHECK-NEXT: tsc: 3315356841494828 } +; CHECK-NEXT: ... diff --git a/test/tools/llvm-xray/X86/convert-with-debug-syms.txt b/test/tools/llvm-xray/X86/convert-with-debug-syms.txt new file mode 100644 index 000000000000..ddb8b6bdb1cc --- /dev/null +++ b/test/tools/llvm-xray/X86/convert-with-debug-syms.txt @@ -0,0 +1,23 @@ +; RUN: llvm-xray convert -m %S/Inputs/elf64-sample-o2.bin -y %S/Inputs/naive-log-simple.xray -f=yaml -o - 2>&1 | FileCheck %s + +; CHECK: --- +; CHECK-NEXT: header: +; CHECK-NEXT: version: 1 +; CHECK-NEXT: type: 0 +; CHECK-NEXT: constant-tsc: true +; CHECK-NEXT: nonstop-tsc: true +; CHECK-NEXT: cycle-frequency: 2601000000 +; CHECK-NEXT: records: +; CHECK-NEXT: - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-enter, +; CHECK-NEXT: tsc: 3315356841453914 } +; CHECK-NEXT: - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-enter, +; CHECK-NEXT: tsc: 3315356841454542 } +; CHECK-NEXT: - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-exit, +; CHECK-NEXT: tsc: 3315356841454670 } +; CHECK-NEXT: - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-enter, +; CHECK-NEXT: tsc: 3315356841454762 } +; CHECK-NEXT: - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-exit, +; CHECK-NEXT: tsc: 3315356841454802 } +; CHECK-NEXT: - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-exit, +; CHECK-NEXT: tsc: 3315356841494828 } +; CHECK-NEXT: ... 
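Going by the symbol names and the enter/exit order in the converted trace above (main, then foo, then bar), the elf64-sample-o2.bin input was presumably built from a program shaped like the following sketch; this is a guess for orientation only, since the actual source is not in the tree:

__attribute__((xray_always_instrument)) int bar(void) { return 2; }
__attribute__((xray_always_instrument)) int foo(void) { return 1; }
__attribute__((xray_always_instrument)) int main(void) {
  int r = foo(); /* foo enters and exits first */
  r += bar();    /* then bar */
  return r;      /* main's exit is the last record */
}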
diff --git a/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt b/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt new file mode 100644 index 000000000000..71c17280df40 --- /dev/null +++ b/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt @@ -0,0 +1,23 @@ +; RUN: llvm-xray convert -m %S/Inputs/elf64-objcopied-instrmap.bin -y %S/Inputs/naive-log-simple.xray -f=yaml -o - 2>&1 | FileCheck %s + +; CHECK: --- +; CHECK-NEXT: header: +; CHECK-NEXT: version: 1 +; CHECK-NEXT: type: 0 +; CHECK-NEXT: constant-tsc: true +; CHECK-NEXT: nonstop-tsc: true +; CHECK-NEXT: cycle-frequency: 2601000000 +; CHECK-NEXT: records: +; CHECK-NEXT: - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697, +; CHECK-NEXT: kind: function-enter, tsc: 3315356841453914 } +; CHECK-NEXT: - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697, +; CHECK-NEXT: kind: function-enter, tsc: 3315356841454542 } +; CHECK-NEXT: - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697, +; CHECK-NEXT: kind: function-exit, tsc: 3315356841454670 } +; CHECK-NEXT: - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697, +; CHECK-NEXT: kind: function-enter, tsc: 3315356841454762 } +; CHECK-NEXT: - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697, +; CHECK-NEXT: kind: function-exit, tsc: 3315356841454802 } +; CHECK-NEXT: - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697, +; CHECK-NEXT: kind: function-exit, tsc: 3315356841494828 } +; CHECK-NEXT: ... diff --git a/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt b/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt new file mode 100644 index 000000000000..01191c9c2a31 --- /dev/null +++ b/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt @@ -0,0 +1,23 @@ +; RUN: llvm-xray convert -m %S/Inputs/simple-xray-instrmap.yaml -t yaml %S/Inputs/naive-log-simple.xray -f=yaml -o - | FileCheck %s + +; CHECK: --- +; CHECK-NEXT: header: +; CHECK-NEXT: version: 1 +; CHECK-NEXT: type: 0 +; CHECK-NEXT: constant-tsc: true +; CHECK-NEXT: nonstop-tsc: true +; CHECK-NEXT: cycle-frequency: 2601000000 +; CHECK-NEXT: records: +; CHECK-NEXT: - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter, +; CHECK-NEXT: tsc: 3315356841453914 } +; CHECK-NEXT: - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter, +; CHECK-NEXT: tsc: 3315356841454542 } +; CHECK-NEXT: - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit, +; CHECK-NEXT: tsc: 3315356841454670 } +; CHECK-NEXT: - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter, +; CHECK-NEXT: tsc: 3315356841454762 } +; CHECK-NEXT: - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit, +; CHECK-NEXT: tsc: 3315356841454802 } +; CHECK-NEXT: - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit, +; CHECK-NEXT: tsc: 3315356841494828 } +; CHECK-NEXT: ... 
diff --git a/tools/dsymutil/DwarfLinker.cpp b/tools/dsymutil/DwarfLinker.cpp index ecd631c1039c..f1ec8a622671 100644 --- a/tools/dsymutil/DwarfLinker.cpp +++ b/tools/dsymutil/DwarfLinker.cpp @@ -205,7 +205,9 @@ public: Info.resize(OrigUnit.getNumDIEs()); auto CUDie = OrigUnit.getUnitDIE(false); - unsigned Lang = CUDie.getAttributeValueAsUnsignedConstant(dwarf::DW_AT_language, 0); + unsigned Lang = + CUDie.getAttributeValueAsUnsignedConstant(dwarf::DW_AT_language) + .getValueOr(0); HasODR = CanUseODR && (Lang == dwarf::DW_LANG_C_plus_plus || Lang == dwarf::DW_LANG_C_plus_plus_03 || Lang == dwarf::DW_LANG_C_plus_plus_11 || @@ -1556,7 +1558,8 @@ PointerIntPair<DeclContext *, 1> DeclContextTree::getChildDeclContext( // Do not unique anything inside CU local functions. if ((Context.getTag() == dwarf::DW_TAG_namespace || Context.getTag() == dwarf::DW_TAG_compile_unit) && - !DIE.getAttributeValueAsUnsignedConstant(dwarf::DW_AT_external, 0)) + !DIE.getAttributeValueAsUnsignedConstant(dwarf::DW_AT_external) + .getValueOr(0)) return PointerIntPair<DeclContext *, 1>(nullptr); LLVM_FALLTHROUGH; case dwarf::DW_TAG_member: @@ -1570,7 +1573,8 @@ PointerIntPair<DeclContext *, 1> DeclContextTree::getChildDeclContext( // created on demand. For example implicitely defined constructors // are ambiguous because of the way we identify contexts, and they // won't be generated everytime everywhere. - if (DIE.getAttributeValueAsUnsignedConstant(dwarf::DW_AT_artificial, 0)) + if (DIE.getAttributeValueAsUnsignedConstant(dwarf::DW_AT_artificial) + .getValueOr(0)) return PointerIntPair<DeclContext *, 1>(nullptr); break; } @@ -1610,11 +1614,12 @@ PointerIntPair<DeclContext *, 1> DeclContextTree::getChildDeclContext( // namespaces, use these additional data points to make the process // safer. This is disabled for clang modules, because forward // declarations of module-defined types do not have a file and line. - ByteSize = DIE.getAttributeValueAsUnsignedConstant( - dwarf::DW_AT_byte_size, UINT64_MAX); + ByteSize = DIE.getAttributeValueAsUnsignedConstant(dwarf::DW_AT_byte_size) + .getValueOr(UINT64_MAX); if (Tag != dwarf::DW_TAG_namespace || !Name) { - if (unsigned FileNum = DIE.getAttributeValueAsUnsignedConstant( - dwarf::DW_AT_decl_file, 0)) { + if (unsigned FileNum = + DIE.getAttributeValueAsUnsignedConstant(dwarf::DW_AT_decl_file) + .getValueOr(0)) { if (const auto *LT = U.getOrigUnit().getContext().getLineTableForUnit( &U.getOrigUnit())) { // FIXME: dsymutil-classic compatibility. I'd rather not @@ -1627,8 +1632,9 @@ PointerIntPair<DeclContext *, 1> DeclContextTree::getChildDeclContext( // instead of "" would allow more uniquing, but for now, do // it this way to match dsymutil-classic. if (LT->hasFileAtIndex(FileNum)) { - Line = DIE.getAttributeValueAsUnsignedConstant( - dwarf::DW_AT_decl_line, 0); + Line = + DIE.getAttributeValueAsUnsignedConstant(dwarf::DW_AT_decl_line) + .getValueOr(0); // Cache the resolved paths, because calling realpath is expansive. StringRef ResolvedPath = U.getResolvedPath(FileNum); if (!ResolvedPath.empty()) { @@ -1803,9 +1809,10 @@ static bool analyzeContextInfo(const DWARFDie &DIE, // Prune this DIE if it is either a forward declaration inside a // DW_TAG_module or a DW_TAG_module that contains nothing but // forward declarations. 
- Info.Prune &= (DIE.getTag() == dwarf::DW_TAG_module) || - DIE.getAttributeValueAsUnsignedConstant( - dwarf::DW_AT_declaration, 0); + Info.Prune &= + (DIE.getTag() == dwarf::DW_TAG_module) || + DIE.getAttributeValueAsUnsignedConstant(dwarf::DW_AT_declaration) + .getValueOr(0); // Don't prune it if there is no definition for the DIE. Info.Prune &= Info.Ctxt && Info.Ctxt->getCanonicalDIEOffset(); @@ -2740,12 +2747,13 @@ DIE *DwarfLinker::DIECloner::cloneDIE( // independantly by the linker). The computation of the actual // high_pc value is done in cloneAddressAttribute(). AttrInfo.OrigHighPc = - InputDIE.getAttributeValueAsAddress(dwarf::DW_AT_high_pc, 0); + InputDIE.getAttributeValueAsAddress(dwarf::DW_AT_high_pc).getValueOr(0); // Also store the low_pc. It might get relocated in an // inline_subprogram that happens at the beginning of its // inlining function. AttrInfo.OrigLowPc = - InputDIE.getAttributeValueAsAddress(dwarf::DW_AT_low_pc, UINT64_MAX); + InputDIE.getAttributeValueAsAddress(dwarf::DW_AT_low_pc) + .getValueOr(UINT64_MAX); } // Reset the Offset to 0 as we will be working on the local copy of @@ -2864,8 +2872,9 @@ void DwarfLinker::patchRangesForUnit(const CompileUnit &Unit, auto InvalidRange = FunctionRanges.end(), CurrRange = InvalidRange; DWARFUnit &OrigUnit = Unit.getOrigUnit(); auto OrigUnitDie = OrigUnit.getUnitDIE(false); - uint64_t OrigLowPc = OrigUnitDie.getAttributeValueAsAddress( - dwarf::DW_AT_low_pc, -1ULL); + uint64_t OrigLowPc = + OrigUnitDie.getAttributeValueAsAddress(dwarf::DW_AT_low_pc) + .getValueOr(-1ULL); // Ranges addresses are based on the unit's low_pc. Compute the // offset we need to apply to adapt to the new unit's low_pc. int64_t UnitPcOffset = 0; diff --git a/tools/llvm-config/BuildVariables.inc.in b/tools/llvm-config/BuildVariables.inc.in index 709ea35044c6..0740c3f9d9f5 100644 --- a/tools/llvm-config/BuildVariables.inc.in +++ b/tools/llvm-config/BuildVariables.inc.in @@ -27,10 +27,10 @@ #define LLVM_TARGETS_BUILT "@LLVM_TARGETS_BUILT@" #define LLVM_SYSTEM_LIBS "@LLVM_SYSTEM_LIBS@" #define LLVM_BUILD_SYSTEM "@LLVM_BUILD_SYSTEM@" -#define LLVM_HAS_RTTI "@LLVM_HAS_RTTI@" -#define LLVM_ENABLE_DYLIB "@LLVM_BUILD_LLVM_DYLIB@" -#define LLVM_LINK_DYLIB "@LLVM_LINK_LLVM_DYLIB@" -#define LLVM_ENABLE_SHARED "@LLVM_ENABLE_SHARED@" +#define LLVM_HAS_RTTI @LLVM_HAS_RTTI@ +#define LLVM_ENABLE_DYLIB @LLVM_BUILD_LLVM_DYLIB@ +#define LLVM_LINK_DYLIB @LLVM_LINK_LLVM_DYLIB@ +#define LLVM_ENABLE_SHARED @BUILD_SHARED_LIBS@ #define LLVM_DYLIB_COMPONENTS "@LLVM_DYLIB_COMPONENTS@" #define LLVM_DYLIB_VERSION "@LLVM_DYLIB_VERSION@" -#define LLVM_HAS_GLOBAL_ISEL "@LLVM_HAS_GLOBAL_ISEL@" +#define LLVM_HAS_GLOBAL_ISEL @LLVM_HAS_GLOBAL_ISEL@ diff --git a/tools/llvm-config/CMakeLists.txt b/tools/llvm-config/CMakeLists.txt index 1f5db59beb50..5112648ea731 100644 --- a/tools/llvm-config/CMakeLists.txt +++ b/tools/llvm-config/CMakeLists.txt @@ -47,12 +47,13 @@ endif() set(LLVM_LDFLAGS "${CMAKE_CXX_LINK_FLAGS}") set(LLVM_BUILDMODE ${CMAKE_BUILD_TYPE}) set(LLVM_SYSTEM_LIBS ${SYSTEM_LIBS}) -if(BUILD_SHARED_LIBS) - set(LLVM_ENABLE_SHARED ON) -else() - set(LLVM_ENABLE_SHARED OFF) -endif() string(REPLACE ";" " " LLVM_TARGETS_BUILT "${LLVM_TARGETS_TO_BUILD}") +llvm_canonicalize_cmake_booleans( + LLVM_BUILD_LLVM_DYLIB + LLVM_LINK_LLVM_DYLIB + LLVM_HAS_RTTI + LLVM_HAS_GLOBAL_ISEL + BUILD_SHARED_LIBS) configure_file(${BUILDVARIABLES_SRCPATH} ${BUILDVARIABLES_OBJPATH} @ONLY) # Set build-time environment(s). 
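
Two details in the llvm-config changes are easy to miss. First, llvm_canonicalize_cmake_booleans normalizes values like ON/OFF to 0/1 before configure_file runs, which is what lets BuildVariables.inc define plain integer macros and llvm-config.cpp replace its strcmp(..., "ON") tests with a simple !!LLVM_ENABLE_DYLIB. Second, the llvm-config.cpp hunk below fixes a real bug in GetAllDyLibComponents: std::string::substr(pos, count) takes a length, not an end index, so passing NextOffset (an absolute position) over-reads every component after the first. A minimal demonstration, with made-up component names:

    #include <cassert>
    #include <string>

    int main() {
      std::string Components = "core;support;object";
      std::size_t Offset = 5;                                // start of "support"
      std::size_t NextOffset = Components.find(';', Offset); // index 12
      // Buggy: NextOffset is used as if it were an end index, but substr's
      // second argument is a character count, so this grabs 12 characters.
      assert(Components.substr(Offset, NextOffset) == "support;obje");
      // Fixed: the count is the distance between the two positions.
      assert(Components.substr(Offset, NextOffset - Offset) == "support");
      return 0;
    }

The first component (Offset == 0) happened to come out right, since with pos == 0 the end index and the count coincide, which is presumably why the bug went unnoticed.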
diff --git a/tools/llvm-config/llvm-config.cpp b/tools/llvm-config/llvm-config.cpp index e8afcbaaf485..25344e4cd011 100644 --- a/tools/llvm-config/llvm-config.cpp +++ b/tools/llvm-config/llvm-config.cpp @@ -212,7 +212,7 @@ Options:\n\ --assertion-mode Print assertion mode of LLVM tree (ON or OFF).\n\ --build-system Print the build system used to build LLVM (always cmake).\n\ --has-rtti Print whether or not LLVM was built with rtti (YES or NO).\n\ - --has-global-isel Print whether or not LLVM was built with global-isel support (YES or NO).\n\ + --has-global-isel Print whether or not LLVM was built with global-isel support (ON or OFF).\n\ --shared-mode Print how the provided components can be collectively linked (`shared` or `static`).\n\ --link-shared Link the components as shared libraries.\n\ --link-static Link the component libraries statically.\n\ @@ -242,7 +242,7 @@ std::vector<std::string> GetAllDyLibComponents(const bool IsInDevelopmentTree, size_t Offset = 0; while (true) { const size_t NextOffset = DyLibComponentsStr.find(';', Offset); - DyLibComponents.push_back(DyLibComponentsStr.substr(Offset, NextOffset)); + DyLibComponents.push_back(DyLibComponentsStr.substr(Offset, NextOffset-Offset)); if (NextOffset == std::string::npos) { break; } @@ -383,10 +383,10 @@ int main(int argc, char **argv) { StaticPrefix = SharedPrefix = "lib"; } - const bool BuiltDyLib = (std::strcmp(LLVM_ENABLE_DYLIB, "ON") == 0); + const bool BuiltDyLib = !!LLVM_ENABLE_DYLIB; /// CMake style shared libs, ie each component is in a shared library. - const bool BuiltSharedLibs = std::strcmp(LLVM_ENABLE_SHARED, "ON") == 0; + const bool BuiltSharedLibs = !!LLVM_ENABLE_SHARED; bool DyLibExists = false; const std::string DyLibName = @@ -395,7 +395,7 @@ int main(int argc, char **argv) { // If LLVM_LINK_DYLIB is ON, the single shared library will be returned // for "--libs", etc, if they exist. This behaviour can be overridden with // --link-static or --link-shared. - bool LinkDyLib = (std::strcmp(LLVM_LINK_DYLIB, "ON") == 0); + bool LinkDyLib = !!LLVM_LINK_DYLIB; if (BuiltDyLib) { std::string path((SharedDir + DirSep + DyLibName).str()); @@ -549,9 +549,9 @@ int main(int argc, char **argv) { } else if (Arg == "--build-system") { OS << LLVM_BUILD_SYSTEM << '\n'; } else if (Arg == "--has-rtti") { - OS << LLVM_HAS_RTTI << '\n'; + OS << (LLVM_HAS_RTTI ? "YES" : "NO") << '\n'; } else if (Arg == "--has-global-isel") { - OS << LLVM_HAS_GLOBAL_ISEL << '\n'; + OS << (LLVM_HAS_GLOBAL_ISEL ? 
"ON" : "OFF") << '\n'; } else if (Arg == "--shared-mode") { PrintSharedMode = true; } else if (Arg == "--obj-root") { diff --git a/tools/llvm-pdbdump/CMakeLists.txt b/tools/llvm-pdbdump/CMakeLists.txt index d929313903da..7c46171941f7 100644 --- a/tools/llvm-pdbdump/CMakeLists.txt +++ b/tools/llvm-pdbdump/CMakeLists.txt @@ -8,20 +8,20 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-pdbdump llvm-pdbdump.cpp - BuiltinDumper.cpp - ClassDefinitionDumper.cpp YamlSymbolDumper.cpp YamlTypeDumper.cpp - CompilandDumper.cpp - EnumDumper.cpp - ExternalSymbolDumper.cpp - FunctionDumper.cpp LinePrinter.cpp LLVMOutputStyle.cpp PdbYaml.cpp - TypeDumper.cpp - TypedefDumper.cpp - VariableDumper.cpp + PrettyBuiltinDumper.cpp + PrettyClassDefinitionDumper.cpp + PrettyCompilandDumper.cpp + PrettyEnumDumper.cpp + PrettyExternalSymbolDumper.cpp + PrettyFunctionDumper.cpp + PrettyTypeDumper.cpp + PrettyTypedefDumper.cpp + PrettyVariableDumper.cpp YAMLOutputStyle.cpp ) diff --git a/tools/llvm-pdbdump/LLVMOutputStyle.cpp b/tools/llvm-pdbdump/LLVMOutputStyle.cpp index 98c67ec9ef3b..629ba40b113c 100644 --- a/tools/llvm-pdbdump/LLVMOutputStyle.cpp +++ b/tools/llvm-pdbdump/LLVMOutputStyle.cpp @@ -10,9 +10,15 @@ #include "LLVMOutputStyle.h" #include "llvm-pdbdump.h" +#include "llvm/DebugInfo/CodeView/CVTypeDumper.h" +#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/EnumTables.h" #include "llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h" #include "llvm/DebugInfo/CodeView/SymbolDumper.h" +#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h" +#include "llvm/DebugInfo/CodeView/TypeDeserializer.h" +#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h" +#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/MSF/StreamReader.h" #include "llvm/DebugInfo/PDB/PDBExtras.h" @@ -83,8 +89,7 @@ static void printSectionOffset(llvm::raw_ostream &OS, OS << Off.Off << ", " << Off.Isect; } -LLVMOutputStyle::LLVMOutputStyle(PDBFile &File) - : File(File), P(outs()), Dumper(&P, false) {} +LLVMOutputStyle::LLVMOutputStyle(PDBFile &File) : File(File), P(outs()) {} Error LLVMOutputStyle::dump() { if (auto EC = dumpFileHeaders()) @@ -519,6 +524,7 @@ Error LLVMOutputStyle::dumpTpiStream(uint32_t StreamIdx) { if (!Tpi) return Tpi.takeError(); + CVTypeDumper Dumper(TypeDB); if (DumpRecords || DumpRecordBytes) { DictScope D(P, Label); @@ -532,7 +538,8 @@ Error LLVMOutputStyle::dumpTpiStream(uint32_t StreamIdx) { DictScope DD(P, ""); if (DumpRecords) { - if (auto EC = Dumper.dump(Type)) + TypeDumpVisitor TDV(TypeDB, &P, false); + if (auto EC = Dumper.dump(Type, TDV)) return EC; } @@ -545,19 +552,23 @@ Error LLVMOutputStyle::dumpTpiStream(uint32_t StreamIdx) { "TPI stream contained corrupt record"); } else if (opts::raw::DumpModuleSyms) { // Even if the user doesn't want to dump type records, we still need to - // iterate them in order to build the list of types so that we can print - // them when dumping module symbols. So when they want to dump symbols - // but not types, use a null output stream. - ScopedPrinter *OldP = Dumper.getPrinter(); - Dumper.setPrinter(nullptr); + // iterate them in order to build the type database. So when they want to + // dump symbols but not types, don't stick a dumper on the end, just build + // the type database. 
+ TypeDatabaseVisitor DBV(TypeDB); + TypeDeserializer Deserializer; + TypeVisitorCallbackPipeline Pipeline; + Pipeline.addCallbackToPipeline(Deserializer); + Pipeline.addCallbackToPipeline(DBV); + + CVTypeVisitor Visitor(Pipeline); bool HadError = false; - for (auto &Type : Tpi->types(&HadError)) { - if (auto EC = Dumper.dump(Type)) + for (auto Type : Tpi->types(&HadError)) { + if (auto EC = Visitor.visitTypeRecord(Type)) return EC; } - Dumper.setPrinter(OldP); dumpTpiHash(P, *Tpi); if (HadError) return make_error<RawError>(raw_error_code::corrupt_file, @@ -640,7 +651,7 @@ Error LLVMOutputStyle::dumpDbiStream() { if (ShouldDumpSymbols) { ListScope SS(P, "Symbols"); - codeview::CVSymbolDumper SD(P, Dumper, nullptr, false); + codeview::CVSymbolDumper SD(P, TypeDB, nullptr, false); bool HadError = false; for (auto S : ModS.symbols(&HadError)) { DictScope LL(P, ""); @@ -865,7 +876,7 @@ Error LLVMOutputStyle::dumpPublicsStream() { P.printList("Section Offsets", Publics->getSectionOffsets(), printSectionOffset); ListScope L(P, "Symbols"); - codeview::CVSymbolDumper SD(P, Dumper, nullptr, false); + codeview::CVSymbolDumper SD(P, TypeDB, nullptr, false); bool HadError = false; for (auto S : Publics->getSymbols(&HadError)) { DictScope DD(P, ""); diff --git a/tools/llvm-pdbdump/LLVMOutputStyle.h b/tools/llvm-pdbdump/LLVMOutputStyle.h index 72a3fd4aba5c..816d591f08f8 100644 --- a/tools/llvm-pdbdump/LLVMOutputStyle.h +++ b/tools/llvm-pdbdump/LLVMOutputStyle.h @@ -12,7 +12,7 @@ #include "OutputStyle.h" -#include "llvm/DebugInfo/CodeView/TypeDumper.h" +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" #include "llvm/Support/ScopedPrinter.h" namespace llvm { @@ -49,7 +49,7 @@ private: PDBFile &File; ScopedPrinter P; - codeview::CVTypeDumper Dumper; + codeview::TypeDatabase TypeDB; std::vector<std::string> StreamPurposes; }; } diff --git a/tools/llvm-pdbdump/BuiltinDumper.cpp b/tools/llvm-pdbdump/PrettyBuiltinDumper.cpp index 2ce1a7839110..f866132aa886 100644 --- a/tools/llvm-pdbdump/BuiltinDumper.cpp +++ b/tools/llvm-pdbdump/PrettyBuiltinDumper.cpp @@ -1,4 +1,4 @@ -//===- BuiltinDumper.cpp ---------------------------------------- *- C++ *-===// +//===- PrettyBuiltinDumper.cpp ---------------------------------- *- C++ *-===// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#include "BuiltinDumper.h" +#include "PrettyBuiltinDumper.h" #include "LinePrinter.h" #include "llvm-pdbdump.h" diff --git a/tools/llvm-pdbdump/BuiltinDumper.h b/tools/llvm-pdbdump/PrettyBuiltinDumper.h index 7a2f1438669c..fb6b0b172e6e 100644 --- a/tools/llvm-pdbdump/BuiltinDumper.h +++ b/tools/llvm-pdbdump/PrettyBuiltinDumper.h @@ -1,4 +1,4 @@ -//===- BuiltinDumper.h ---------------------------------------- *- C++ --*-===// +//===- PrettyBuiltinDumper.h ---------------------------------- *- C++ --*-===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVMPDBDUMP_BUILTINDUMPER_H -#define LLVM_TOOLS_LLVMPDBDUMP_BUILTINDUMPER_H +#ifndef LLVM_TOOLS_LLVMPDBDUMP_PRETTYBUILTINDUMPER_H +#define LLVM_TOOLS_LLVMPDBDUMP_PRETTYBUILTINDUMPER_H #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/PDB/PDBSymDumper.h" diff --git a/tools/llvm-pdbdump/ClassDefinitionDumper.cpp b/tools/llvm-pdbdump/PrettyClassDefinitionDumper.cpp index 553bc0b267c2..b0c534f7c5b1 100644 --- a/tools/llvm-pdbdump/ClassDefinitionDumper.cpp +++ 
b/tools/llvm-pdbdump/PrettyClassDefinitionDumper.cpp @@ -1,4 +1,4 @@ -//===- ClassDefinitionDumper.cpp --------------------------------*- C++ -*-===// +//===- PrettyClassDefinitionDumper.cpp --------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,13 +7,14 @@ // //===----------------------------------------------------------------------===// -#include "ClassDefinitionDumper.h" -#include "EnumDumper.h" -#include "FunctionDumper.h" +#include "PrettyClassDefinitionDumper.h" + #include "LinePrinter.h" +#include "PrettyEnumDumper.h" +#include "PrettyFunctionDumper.h" +#include "PrettyTypedefDumper.h" +#include "PrettyVariableDumper.h" #include "llvm-pdbdump.h" -#include "TypedefDumper.h" -#include "VariableDumper.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" #include "llvm/DebugInfo/PDB/PDBExtras.h" diff --git a/tools/llvm-pdbdump/ClassDefinitionDumper.h b/tools/llvm-pdbdump/PrettyClassDefinitionDumper.h index 304e11dcb6c9..0831f47557ed 100644 --- a/tools/llvm-pdbdump/ClassDefinitionDumper.h +++ b/tools/llvm-pdbdump/PrettyClassDefinitionDumper.h @@ -1,4 +1,4 @@ -//===- ClassDefinitionDumper.h - --------------------------------*- C++ -*-===// +//===- PrettyClassDefinitionDumper.h ----------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,12 +7,12 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVMPDBDUMP_CLASSDEFINITIONDUMPER_H -#define LLVM_TOOLS_LLVMPDBDUMP_CLASSDEFINITIONDUMPER_H +#ifndef LLVM_TOOLS_LLVMPDBDUMP_PRETTYCLASSDEFINITIONDUMPER_H +#define LLVM_TOOLS_LLVMPDBDUMP_PRETTYCLASSDEFINITIONDUMPER_H #include "llvm/DebugInfo/PDB/PDBSymDumper.h" -#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h" #include "llvm/DebugInfo/PDB/PDBSymbolData.h" +#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h" #include <list> #include <memory> diff --git a/tools/llvm-pdbdump/CompilandDumper.cpp b/tools/llvm-pdbdump/PrettyCompilandDumper.cpp index 05141818660e..6257313e3e1a 100644 --- a/tools/llvm-pdbdump/CompilandDumper.cpp +++ b/tools/llvm-pdbdump/PrettyCompilandDumper.cpp @@ -1,4 +1,4 @@ -//===- CompilandDumper.cpp - llvm-pdbdump compiland symbol dumper *- C++ *-===// +//===- PrettyCompilandDumper.cpp - llvm-pdbdump compiland dumper -*- C++ *-===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,10 @@ // //===----------------------------------------------------------------------===// -#include "CompilandDumper.h" +#include "PrettyCompilandDumper.h" + #include "LinePrinter.h" +#include "PrettyFunctionDumper.h" #include "llvm-pdbdump.h" #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" @@ -30,8 +32,6 @@ #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" -#include "FunctionDumper.h" - #include <utility> using namespace llvm; diff --git a/tools/llvm-pdbdump/CompilandDumper.h b/tools/llvm-pdbdump/PrettyCompilandDumper.h index 462aaeb2611f..2127e7d1f529 100644 --- a/tools/llvm-pdbdump/CompilandDumper.h +++ b/tools/llvm-pdbdump/PrettyCompilandDumper.h @@ -1,4 +1,4 @@ -//===- CompilandDumper.h - llvm-pdbdump compiland symbol dumper *- C++ --*-===// +//===- PrettyCompilandDumper.h - llvm-pdbdump compiland dumper -*- C++ --*-===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVMPDBDUMP_COMPILANDDUMPER_H -#define LLVM_TOOLS_LLVMPDBDUMP_COMPILANDDUMPER_H +#ifndef LLVM_TOOLS_LLVMPDBDUMP_PRETTYCOMPILANDDUMPER_H +#define LLVM_TOOLS_LLVMPDBDUMP_PRETTYCOMPILANDDUMPER_H 
#include "llvm/DebugInfo/PDB/PDBSymDumper.h" diff --git a/tools/llvm-pdbdump/EnumDumper.cpp b/tools/llvm-pdbdump/PrettyEnumDumper.cpp index 43b6018ffedf..965ca1b9f989 100644 --- a/tools/llvm-pdbdump/EnumDumper.cpp +++ b/tools/llvm-pdbdump/PrettyEnumDumper.cpp @@ -1,4 +1,4 @@ -//===- EnumDumper.cpp -------------------------------------------*- C++ -*-===// +//===- PrettyEnumDumper.cpp -------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,10 +7,10 @@ // //===----------------------------------------------------------------------===// -#include "EnumDumper.h" +#include "PrettyEnumDumper.h" -#include "BuiltinDumper.h" #include "LinePrinter.h" +#include "PrettyBuiltinDumper.h" #include "llvm-pdbdump.h" #include "llvm/DebugInfo/PDB/PDBSymbolData.h" diff --git a/tools/llvm-pdbdump/EnumDumper.h b/tools/llvm-pdbdump/PrettyEnumDumper.h index 0a34e1f89ada..c6e65a6d1772 100644 --- a/tools/llvm-pdbdump/EnumDumper.h +++ b/tools/llvm-pdbdump/PrettyEnumDumper.h @@ -1,4 +1,4 @@ -//===- EnumDumper.h - -------------------------------------------*- C++ -*-===// +//===- PrettyEnumDumper.h ---------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVMPDBDUMP_ENUMDUMPER_H -#define LLVM_TOOLS_LLVMPDBDUMP_ENUMDUMPER_H +#ifndef LLVM_TOOLS_LLVMPDBDUMP_PRETTYENUMDUMPER_H +#define LLVM_TOOLS_LLVMPDBDUMP_PRETTYENUMDUMPER_H #include "llvm/DebugInfo/PDB/PDBSymDumper.h" diff --git a/tools/llvm-pdbdump/ExternalSymbolDumper.cpp b/tools/llvm-pdbdump/PrettyExternalSymbolDumper.cpp index 508a2405772e..fc40d90cee96 100644 --- a/tools/llvm-pdbdump/ExternalSymbolDumper.cpp +++ b/tools/llvm-pdbdump/PrettyExternalSymbolDumper.cpp @@ -1,4 +1,4 @@ -//===- ExternalSymbolDumper.cpp -------------------------------- *- C++ *-===// +//===- PrettyExternalSymbolDumper.cpp -------------------------- *- C++ *-===// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#include "ExternalSymbolDumper.h" +#include "PrettyExternalSymbolDumper.h" #include "LinePrinter.h" #include "llvm/DebugInfo/PDB/PDBSymbolExe.h" diff --git a/tools/llvm-pdbdump/ExternalSymbolDumper.h b/tools/llvm-pdbdump/PrettyExternalSymbolDumper.h index b44b8a6fe98a..6a009862ddd4 100644 --- a/tools/llvm-pdbdump/ExternalSymbolDumper.h +++ b/tools/llvm-pdbdump/PrettyExternalSymbolDumper.h @@ -1,4 +1,4 @@ -//===- ExternalSymbolDumper.h --------------------------------- *- C++ --*-===// +//===- PrettyExternalSymbolDumper.h --------------------------- *- C++ --*-===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVMPDBDUMP_EXTERNALSYMBOLDUMPER_H -#define LLVM_TOOLS_LLVMPDBDUMP_EXTERNALSYMBOLDUMPER_H +#ifndef LLVM_TOOLS_LLVMPDBDUMP_PRETTYEXTERNALSYMBOLDUMPER_H +#define LLVM_TOOLS_LLVMPDBDUMP_PRETTYEXTERNALSYMBOLDUMPER_H #include "llvm/DebugInfo/PDB/PDBSymDumper.h" diff --git a/tools/llvm-pdbdump/FunctionDumper.cpp b/tools/llvm-pdbdump/PrettyFunctionDumper.cpp index 29ba15d521f0..2f6ca894fadf 100644 --- a/tools/llvm-pdbdump/FunctionDumper.cpp +++ b/tools/llvm-pdbdump/PrettyFunctionDumper.cpp @@ -1,4 +1,4 @@ -//===- FunctionDumper.cpp ------------------------------------ *- C++ *-===// +//===- PrettyFunctionDumper.cpp --------------------------------- *- C++ *-===// // // The LLVM 
Compiler Infrastructure // @@ -7,9 +7,9 @@ // //===----------------------------------------------------------------------===// -#include "FunctionDumper.h" -#include "BuiltinDumper.h" +#include "PrettyFunctionDumper.h" #include "LinePrinter.h" +#include "PrettyBuiltinDumper.h" #include "llvm-pdbdump.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" diff --git a/tools/llvm-pdbdump/FunctionDumper.h b/tools/llvm-pdbdump/PrettyFunctionDumper.h index c71fafa18ed3..1a6f5430ec5a 100644 --- a/tools/llvm-pdbdump/FunctionDumper.h +++ b/tools/llvm-pdbdump/PrettyFunctionDumper.h @@ -1,4 +1,4 @@ -//===- FunctionDumper.h --------------------------------------- *- C++ --*-===// +//===- PrettyFunctionDumper.h --------------------------------- *- C++ --*-===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVMPDBDUMP_FUNCTIONDUMPER_H -#define LLVM_TOOLS_LLVMPDBDUMP_FUNCTIONDUMPER_H +#ifndef LLVM_TOOLS_LLVMPDBDUMP_PRETTYFUNCTIONDUMPER_H +#define LLVM_TOOLS_LLVMPDBDUMP_PRETTYFUNCTIONDUMPER_H #include "llvm/DebugInfo/PDB/PDBSymDumper.h" diff --git a/tools/llvm-pdbdump/TypeDumper.cpp b/tools/llvm-pdbdump/PrettyTypeDumper.cpp index a49d64045553..4f70c8047337 100644 --- a/tools/llvm-pdbdump/TypeDumper.cpp +++ b/tools/llvm-pdbdump/PrettyTypeDumper.cpp @@ -1,4 +1,4 @@ -//===- TypeDumper.cpp - PDBSymDumper implementation for types *----- C++ *-===// +//===- PrettyTypeDumper.cpp - PDBSymDumper type dumper *------------ C++ *-===// // // The LLVM Compiler Infrastructure // @@ -7,14 +7,14 @@ // //===----------------------------------------------------------------------===// -#include "TypeDumper.h" +#include "PrettyTypeDumper.h" -#include "BuiltinDumper.h" -#include "ClassDefinitionDumper.h" -#include "EnumDumper.h" #include "LinePrinter.h" +#include "PrettyBuiltinDumper.h" +#include "PrettyClassDefinitionDumper.h" +#include "PrettyEnumDumper.h" +#include "PrettyTypedefDumper.h" #include "llvm-pdbdump.h" -#include "TypedefDumper.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" #include "llvm/DebugInfo/PDB/PDBSymbolExe.h" diff --git a/tools/llvm-pdbdump/TypeDumper.h b/tools/llvm-pdbdump/PrettyTypeDumper.h index 76a477964f1f..f9d8304c3208 100644 --- a/tools/llvm-pdbdump/TypeDumper.h +++ b/tools/llvm-pdbdump/PrettyTypeDumper.h @@ -1,4 +1,4 @@ -//===- TypeDumper.h - PDBSymDumper implementation for types *- C++ ------*-===// +//===- PrettyTypeDumper.h - PDBSymDumper implementation for types *- C++ *-===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVMPDBDUMP_TYPEDUMPER_H -#define LLVM_TOOLS_LLVMPDBDUMP_TYPEDUMPER_H +#ifndef LLVM_TOOLS_LLVMPDBDUMP_PRETTYTYPEDUMPER_H +#define LLVM_TOOLS_LLVMPDBDUMP_PRETTYTYPEDUMPER_H #include "llvm/DebugInfo/PDB/PDBSymDumper.h" diff --git a/tools/llvm-pdbdump/TypedefDumper.cpp b/tools/llvm-pdbdump/PrettyTypedefDumper.cpp index b1e017613ce1..c458755cb780 100644 --- a/tools/llvm-pdbdump/TypedefDumper.cpp +++ b/tools/llvm-pdbdump/PrettyTypedefDumper.cpp @@ -1,4 +1,4 @@ -//===- TypedefDumper.cpp - PDBSymDumper impl for typedefs -------- * C++ *-===// +//===- PrettyTypedefDumper.cpp - PDBSymDumper impl for typedefs -- * C++ *-===// // // The LLVM Compiler Infrastructure // @@ -7,11 +7,11 @@ // //===----------------------------------------------------------------------===// -#include "TypedefDumper.h" +#include "PrettyTypedefDumper.h" -#include "BuiltinDumper.h" 
-#include "FunctionDumper.h" #include "LinePrinter.h" +#include "PrettyBuiltinDumper.h" +#include "PrettyFunctionDumper.h" #include "llvm-pdbdump.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" diff --git a/tools/llvm-pdbdump/TypedefDumper.h b/tools/llvm-pdbdump/PrettyTypedefDumper.h index c22b58a7e41e..34c139601301 100644 --- a/tools/llvm-pdbdump/TypedefDumper.h +++ b/tools/llvm-pdbdump/PrettyTypedefDumper.h @@ -1,4 +1,4 @@ -//===- TypedefDumper.h - llvm-pdbdump typedef dumper ---------*- C++ ----*-===// +//===- PrettyTypedefDumper.h - llvm-pdbdump typedef dumper ---*- C++ ----*-===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVMPDBDUMP_TYPEDEFDUMPER_H -#define LLVM_TOOLS_LLVMPDBDUMP_TYPEDEFDUMPER_H +#ifndef LLVM_TOOLS_LLVMPDBDUMP_PRETTYTYPEDEFDUMPER_H +#define LLVM_TOOLS_LLVMPDBDUMP_PRETTYTYPEDEFDUMPER_H #include "llvm/DebugInfo/PDB/PDBSymDumper.h" diff --git a/tools/llvm-pdbdump/VariableDumper.cpp b/tools/llvm-pdbdump/PrettyVariableDumper.cpp index 284d7e9b731f..e1469186ad8b 100644 --- a/tools/llvm-pdbdump/VariableDumper.cpp +++ b/tools/llvm-pdbdump/PrettyVariableDumper.cpp @@ -1,4 +1,4 @@ -//===- VariableDumper.cpp - -------------------------------------*- C++ -*-===// +//===- PrettyVariableDumper.cpp ---------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,21 +7,21 @@ // //===----------------------------------------------------------------------===// -#include "VariableDumper.h" +#include "PrettyVariableDumper.h" -#include "BuiltinDumper.h" #include "LinePrinter.h" +#include "PrettyBuiltinDumper.h" +#include "PrettyFunctionDumper.h" #include "llvm-pdbdump.h" -#include "FunctionDumper.h" #include "llvm/DebugInfo/PDB/PDBSymbolData.h" #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h" -#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h" #include "llvm/Support/Format.h" diff --git a/tools/llvm-pdbdump/VariableDumper.h b/tools/llvm-pdbdump/PrettyVariableDumper.h index 4f00358878c9..a122bb86058c 100644 --- a/tools/llvm-pdbdump/VariableDumper.h +++ b/tools/llvm-pdbdump/PrettyVariableDumper.h @@ -1,4 +1,4 @@ -//===- VariableDumper.h - PDBSymDumper implementation for types -*- C++ -*-===// +//===- PrettyVariableDumper.h - PDBSymDumper variable dumper ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVMPDBDUMP_VARIABLEDUMPER_H -#define LLVM_TOOLS_LLVMPDBDUMP_VARIABLEDUMPER_H +#ifndef LLVM_TOOLS_LLVMPDBDUMP_PRETTYVARIABLEDUMPER_H +#define LLVM_TOOLS_LLVMPDBDUMP_PRETTYVARIABLEDUMPER_H #include "llvm/DebugInfo/PDB/PDBSymDumper.h" diff --git a/tools/llvm-pdbdump/YAMLOutputStyle.h b/tools/llvm-pdbdump/YAMLOutputStyle.h index 540dee4121e6..3cd603a95b6a 100644 --- a/tools/llvm-pdbdump/YAMLOutputStyle.h +++ b/tools/llvm-pdbdump/YAMLOutputStyle.h @@ -13,7 +13,7 @@ #include "OutputStyle.h" #include "PdbYaml.h" -#include "llvm/DebugInfo/CodeView/TypeDumper.h" +#include "llvm/DebugInfo/CodeView/CVTypeDumper.h" #include "llvm/Support/ScopedPrinter.h" #include 
"llvm/Support/YAMLTraits.h" diff --git a/tools/llvm-pdbdump/llvm-pdbdump.cpp b/tools/llvm-pdbdump/llvm-pdbdump.cpp index b356a28d2189..d3495e524abc 100644 --- a/tools/llvm-pdbdump/llvm-pdbdump.cpp +++ b/tools/llvm-pdbdump/llvm-pdbdump.cpp @@ -14,14 +14,14 @@ //===----------------------------------------------------------------------===// #include "llvm-pdbdump.h" -#include "CompilandDumper.h" -#include "ExternalSymbolDumper.h" -#include "FunctionDumper.h" #include "LLVMOutputStyle.h" #include "LinePrinter.h" #include "OutputStyle.h" -#include "TypeDumper.h" -#include "VariableDumper.h" +#include "PrettyCompilandDumper.h" +#include "PrettyExternalSymbolDumper.h" +#include "PrettyFunctionDumper.h" +#include "PrettyTypeDumper.h" +#include "PrettyVariableDumper.h" #include "YAMLOutputStyle.h" #include "llvm/ADT/ArrayRef.h" diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp index 0ca186519cd2..c83655fe4d22 100644 --- a/tools/llvm-readobj/COFFDumper.cpp +++ b/tools/llvm-readobj/COFFDumper.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/DebugInfo/CodeView/CVTypeDumper.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/DebugInfo/CodeView/RecordSerialization.h" @@ -29,7 +30,7 @@ #include "llvm/DebugInfo/CodeView/SymbolDumpDelegate.h" #include "llvm/DebugInfo/CodeView/SymbolDumper.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" -#include "llvm/DebugInfo/CodeView/TypeDumper.h" +#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeStreamMerger.h" @@ -64,8 +65,7 @@ class COFFDumper : public ObjDumper { public: friend class COFFObjectDumpDelegate; COFFDumper(const llvm::object::COFFObjectFile *Obj, ScopedPrinter &Writer) - : ObjDumper(Writer), Obj(Obj), - CVTD(&Writer, opts::CodeViewSubsectionBytes) {} + : ObjDumper(Writer), Obj(Obj), Writer(Writer) {} void printFileHeaders() override; void printSections() override; @@ -99,7 +99,7 @@ private: void printFileNameForOffset(StringRef Label, uint32_t FileOffset); void printTypeIndex(StringRef FieldName, TypeIndex TI) { // Forward to CVTypeDumper for simplicity. 
- CVTD.printTypeIndex(FieldName, TI); + CVTypeDumper::printTypeIndex(Writer, FieldName, TI, TypeDB); } void printCodeViewSymbolsSubsection(StringRef Subsection, @@ -142,7 +142,8 @@ private: StringRef CVFileChecksumTable; StringRef CVStringTable; - CVTypeDumper CVTD; + ScopedPrinter &Writer; + TypeDatabase TypeDB; }; class COFFObjectDumpDelegate : public SymbolDumpDelegate { @@ -962,7 +963,8 @@ void COFFDumper::printCodeViewSymbolsSubsection(StringRef Subsection, auto CODD = llvm::make_unique<COFFObjectDumpDelegate>(*this, Section, Obj, SectionContents); - CVSymbolDumper CVSD(W, CVTD, std::move(CODD), opts::CodeViewSubsectionBytes); + CVSymbolDumper CVSD(W, TypeDB, std::move(CODD), + opts::CodeViewSubsectionBytes); ByteStream Stream(BinaryData); CVSymbolArray Symbols; StreamReader Reader(Stream); @@ -1106,7 +1108,9 @@ void COFFDumper::printCodeViewTypeSection(StringRef SectionName, if (Magic != COFF::DEBUG_SECTION_MAGIC) return error(object_error::parse_failed); - if (auto EC = CVTD.dump({Data.bytes_begin(), Data.bytes_end()})) { + CVTypeDumper CVTD(TypeDB); + TypeDumpVisitor TDV(TypeDB, &W, opts::CodeViewSubsectionBytes); + if (auto EC = CVTD.dump({Data.bytes_begin(), Data.bytes_end()}, TDV)) { W.flush(); error(llvm::errorToErrorCode(std::move(EC))); } @@ -1552,8 +1556,12 @@ void llvm::dumpCodeViewMergedTypes(ScopedPrinter &Writer, CVTypes.ForEachRecord([&](TypeIndex TI, ArrayRef<uint8_t> Record) { Buf.append(Record.begin(), Record.end()); }); - CVTypeDumper CVTD(&Writer, opts::CodeViewSubsectionBytes); - if (auto EC = CVTD.dump({Buf.str().bytes_begin(), Buf.str().bytes_end()})) { + + TypeDatabase TypeDB; + CVTypeDumper CVTD(TypeDB); + TypeDumpVisitor TDV(TypeDB, &Writer, opts::CodeViewSubsectionBytes); + if (auto EC = + CVTD.dump({Buf.str().bytes_begin(), Buf.str().bytes_end()}, TDV)) { Writer.flush(); error(llvm::errorToErrorCode(std::move(EC))); } diff --git a/tools/llvm-xray/CMakeLists.txt b/tools/llvm-xray/CMakeLists.txt index 0084e35c1b0b..abcd7d932110 100644 --- a/tools/llvm-xray/CMakeLists.txt +++ b/tools/llvm-xray/CMakeLists.txt @@ -1,9 +1,16 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} + DebugInfoDWARF + Object Support - Object) + Symbolize + XRay) set(LLVM_XRAY_TOOLS + func-id-helper.cc + xray-account.cc + xray-converter.cc + xray-extract.cc xray-extract.cc xray-registry.cc) diff --git a/tools/llvm-xray/func-id-helper.cc b/tools/llvm-xray/func-id-helper.cc new file mode 100644 index 000000000000..3234010695b2 --- /dev/null +++ b/tools/llvm-xray/func-id-helper.cc @@ -0,0 +1,60 @@ +//===- xray-fc-account.cc - XRay Function Call Accounting Tool ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the helper tools dealing with XRay-generated function ids. 
+// +//===----------------------------------------------------------------------===// + +#include "func-id-helper.h" +#include "llvm/Support/Path.h" +#include <sstream> + +using namespace llvm; +using namespace xray; + +std::string FuncIdConversionHelper::SymbolOrNumber(int32_t FuncId) const { + std::ostringstream F; + auto It = FunctionAddresses.find(FuncId); + if (It == FunctionAddresses.end()) { + F << "#" << FuncId; + return F.str(); + } + + if (auto ResOrErr = Symbolizer.symbolizeCode(BinaryInstrMap, It->second)) { + auto &DI = *ResOrErr; + if (DI.FunctionName == "<invalid>") + F << "@(" << std::hex << It->second << ")"; + else + F << DI.FunctionName; + } else + handleAllErrors(ResOrErr.takeError(), [&](const ErrorInfoBase &) { + F << "@(" << std::hex << It->second << ")"; + }); + + return F.str(); +} + +std::string FuncIdConversionHelper::FileLineAndColumn(int32_t FuncId) const { + auto It = FunctionAddresses.find(FuncId); + if (It == FunctionAddresses.end()) + return "(unknown)"; + + std::ostringstream F; + auto ResOrErr = Symbolizer.symbolizeCode(BinaryInstrMap, It->second); + if (!ResOrErr) { + consumeError(ResOrErr.takeError()); + return "(unknown)"; + } + + auto &DI = *ResOrErr; + F << sys::path::filename(DI.FileName).str() << ":" << DI.Line << ":" + << DI.Column; + + return F.str(); +} diff --git a/tools/llvm-xray/func-id-helper.h b/tools/llvm-xray/func-id-helper.h new file mode 100644 index 000000000000..7348a7100b05 --- /dev/null +++ b/tools/llvm-xray/func-id-helper.h @@ -0,0 +1,49 @@ +//===- func-id-helper.h - XRay Function ID Conversion Helpers -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Defines helper tools dealing with XRay-generated function ids. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_LLVM_XRAY_FUNC_ID_HELPER_H +#define LLVM_TOOLS_LLVM_XRAY_FUNC_ID_HELPER_H + +#include "llvm/DebugInfo/Symbolize/Symbolize.h" +#include <unordered_map> + +namespace llvm { +namespace xray { + +// This class consolidates common operations related to Function IDs. +class FuncIdConversionHelper { +public: + using FunctionAddressMap = std::unordered_map<int32_t, uint64_t>; + +private: + std::string BinaryInstrMap; + symbolize::LLVMSymbolizer &Symbolizer; + const FunctionAddressMap &FunctionAddresses; + +public: + FuncIdConversionHelper(std::string BinaryInstrMap, + symbolize::LLVMSymbolizer &Symbolizer, + const FunctionAddressMap &FunctionAddresses) + : BinaryInstrMap(std::move(BinaryInstrMap)), Symbolizer(Symbolizer), + FunctionAddresses(FunctionAddresses) {} + + // Returns the symbol or a string representation of the function id. + std::string SymbolOrNumber(int32_t FuncId) const; + + // Returns the file and column from debug info for the given function id. 
+ std::string FileLineAndColumn(int32_t FuncId) const; +}; + +} // namespace xray +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_XRAY_FUNC_ID_HELPER_H diff --git a/tools/llvm-xray/xray-account.cc b/tools/llvm-xray/xray-account.cc new file mode 100644 index 000000000000..671a5a073eec --- /dev/null +++ b/tools/llvm-xray/xray-account.cc @@ -0,0 +1,485 @@ +//===- xray-account.h - XRay Function Call Accounting ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements basic function call accounting from an XRay trace. +// +//===----------------------------------------------------------------------===// + +#include <algorithm> +#include <cassert> +#include <numeric> +#include <system_error> +#include <utility> + +#include "xray-account.h" +#include "xray-extract.h" +#include "xray-registry.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/XRay/Trace.h" + +using namespace llvm; +using namespace llvm::xray; + +static cl::SubCommand Account("account", "Function call accounting"); +static cl::opt<std::string> AccountInput(cl::Positional, + cl::desc("<xray log file>"), + cl::Required, cl::sub(Account)); +static cl::opt<bool> + AccountKeepGoing("keep-going", cl::desc("Keep going on errors encountered"), + cl::sub(Account), cl::init(false)); +static cl::alias AccountKeepGoing2("k", cl::aliasopt(AccountKeepGoing), + cl::desc("Alias for -keep_going"), + cl::sub(Account)); +static cl::opt<bool> AccountDeduceSiblingCalls( + "deduce-sibling-calls", + cl::desc("Deduce sibling calls when unrolling function call stacks"), + cl::sub(Account), cl::init(false)); +static cl::alias + AccountDeduceSiblingCalls2("d", cl::aliasopt(AccountDeduceSiblingCalls), + cl::desc("Alias for -deduce_sibling_calls"), + cl::sub(Account)); +static cl::opt<std::string> + AccountOutput("output", cl::value_desc("output file"), cl::init("-"), + cl::desc("output file; use '-' for stdout"), + cl::sub(Account)); +static cl::alias AccountOutput2("o", cl::aliasopt(AccountOutput), + cl::desc("Alias for -output"), + cl::sub(Account)); +enum class AccountOutputFormats { TEXT, CSV }; +static cl::opt<AccountOutputFormats> + AccountOutputFormat("format", cl::desc("output format"), + cl::values(clEnumValN(AccountOutputFormats::TEXT, + "text", "report stats in text"), + clEnumValN(AccountOutputFormats::CSV, "csv", + "report stats in csv")), + cl::sub(Account)); +static cl::alias AccountOutputFormat2("f", cl::desc("Alias of -format"), + cl::aliasopt(AccountOutputFormat), + cl::sub(Account)); + +enum class SortField { + FUNCID, + COUNT, + MIN, + MED, + PCT90, + PCT99, + MAX, + SUM, + FUNC, +}; + +static cl::opt<SortField> AccountSortOutput( + "sort", cl::desc("sort output by this field"), cl::value_desc("field"), + cl::sub(Account), cl::init(SortField::FUNCID), + cl::values(clEnumValN(SortField::FUNCID, "funcid", "function id"), + clEnumValN(SortField::COUNT, "count", "funciton call counts"), + clEnumValN(SortField::MIN, "min", "minimum function durations"), + clEnumValN(SortField::MED, "med", "median function durations"), + clEnumValN(SortField::PCT90, "90p", "90th percentile durations"), + clEnumValN(SortField::PCT99, "99p", "99th percentile durations"), + clEnumValN(SortField::MAX, "max", "maximum function durations"), + clEnumValN(SortField::SUM, "sum", 
"sum of call durations"), + clEnumValN(SortField::FUNC, "func", "function names"))); +static cl::alias AccountSortOutput2("s", cl::aliasopt(AccountSortOutput), + cl::desc("Alias for -sort"), + cl::sub(Account)); + +enum class SortDirection { + ASCENDING, + DESCENDING, +}; +static cl::opt<SortDirection> AccountSortOrder( + "sortorder", cl::desc("sort ordering"), cl::init(SortDirection::ASCENDING), + cl::values(clEnumValN(SortDirection::ASCENDING, "asc", "ascending"), + clEnumValN(SortDirection::DESCENDING, "dsc", "descending")), + cl::sub(Account)); +static cl::alias AccountSortOrder2("r", cl::aliasopt(AccountSortOrder), + cl::desc("Alias for -sortorder"), + cl::sub(Account)); + +static cl::opt<int> AccountTop("top", cl::desc("only show the top N results"), + cl::value_desc("N"), cl::sub(Account), + cl::init(-1)); +static cl::alias AccountTop2("p", cl::desc("Alias for -top"), + cl::aliasopt(AccountTop), cl::sub(Account)); + +static cl::opt<std::string> + AccountInstrMap("instr_map", + cl::desc("binary with the instrumentation map, or " + "a separate instrumentation map"), + cl::value_desc("binary with xray_instr_map"), + cl::sub(Account), cl::init("")); +static cl::alias AccountInstrMap2("m", cl::aliasopt(AccountInstrMap), + cl::desc("Alias for -instr_map"), + cl::sub(Account)); +static cl::opt<InstrumentationMapExtractor::InputFormats> InstrMapFormat( + "instr-map-format", cl::desc("format of instrumentation map"), + cl::values(clEnumValN(InstrumentationMapExtractor::InputFormats::ELF, "elf", + "instrumentation map in an ELF header"), + clEnumValN(InstrumentationMapExtractor::InputFormats::YAML, + "yaml", "instrumentation map in YAML")), + cl::sub(Account), cl::init(InstrumentationMapExtractor::InputFormats::ELF)); +static cl::alias InstrMapFormat2("t", cl::aliasopt(InstrMapFormat), + cl::desc("Alias for -instr-map-format"), + cl::sub(Account)); + +namespace { + +template <class T, class U> void setMinMax(std::pair<T, T> &MM, U &&V) { + if (MM.first == 0 || MM.second == 0) + MM = std::make_pair(std::forward<U>(V), std::forward<U>(V)); + else + MM = std::make_pair(std::min(MM.first, V), std::max(MM.second, V)); +} + +template <class T> T diff(T L, T R) { return std::max(L, R) - std::min(L, R); } + +} // namespace + +bool LatencyAccountant::accountRecord(const XRayRecord &Record) { + setMinMax(PerThreadMinMaxTSC[Record.TId], Record.TSC); + setMinMax(PerCPUMinMaxTSC[Record.CPU], Record.TSC); + + if (CurrentMaxTSC == 0) + CurrentMaxTSC = Record.TSC; + + if (Record.TSC < CurrentMaxTSC) + return false; + + auto &ThreadStack = PerThreadFunctionStack[Record.TId]; + switch (Record.Type) { + case RecordTypes::ENTER: { + // Function Enter + ThreadStack.emplace_back(Record.FuncId, Record.TSC); + break; + } + case RecordTypes::EXIT: { + // Function Exit + if (ThreadStack.back().first == Record.FuncId) { + const auto &Top = ThreadStack.back(); + recordLatency(Top.first, diff(Top.second, Record.TSC)); + ThreadStack.pop_back(); + break; + } + + if (!DeduceSiblingCalls) + return false; + + // Look for the parent up the stack. + auto Parent = + std::find_if(ThreadStack.rbegin(), ThreadStack.rend(), + [&](const std::pair<const int32_t, uint64_t> &E) { + return E.first == Record.FuncId; + }); + if (Parent == ThreadStack.rend()) + return false; + + // Account time for this apparently sibling call exit up the stack. 
+ // Considering the following case: + // + // f() + // g() + // h() + // + // We might only ever see the following entries: + // + // -> f() + // -> g() + // -> h() + // <- h() + // <- f() + // + // Now we don't see the exit to g() because some older version of the XRay + // runtime wasn't instrumenting tail exits. If we don't deduce tail calls, + // we may potentially never account time for g() -- and this code would have + // already bailed out, because `<- f()` doesn't match the current "top" of + // stack where we're waiting for the exit to `g()` instead. This is not + // ideal and brittle -- so instead we provide a potentially inaccurate + // accounting of g() instead, computing it from the exit of f(). + // + // While it might be better that we account the time between `-> g()` and + // `-> h()` as the proper accounting of time for g() here, this introduces + // complexity to do correctly (need to backtrack, etc.). + // + // FIXME: Potentially implement the more complex deduction algorithm? + auto I = std::next(Parent).base(); + for (auto &E : make_range(I, ThreadStack.end())) { + recordLatency(E.first, diff(E.second, Record.TSC)); + } + ThreadStack.erase(I, ThreadStack.end()); + break; + } + } + + return true; +} + +namespace { + +// We consolidate the data into a struct which we can output in various forms. +struct ResultRow { + uint64_t Count; + double Min; + double Median; + double Pct90; + double Pct99; + double Max; + double Sum; + std::string DebugInfo; + std::string Function; +}; + +ResultRow getStats(std::vector<uint64_t> &Timings) { + assert(!Timings.empty()); + ResultRow R; + R.Sum = std::accumulate(Timings.begin(), Timings.end(), 0.0); + auto MinMax = std::minmax_element(Timings.begin(), Timings.end()); + R.Min = *MinMax.first; + R.Max = *MinMax.second; + auto MedianOff = Timings.size() / 2; + std::nth_element(Timings.begin(), Timings.begin() + MedianOff, Timings.end()); + R.Median = Timings[MedianOff]; + auto Pct90Off = std::floor(Timings.size() * 0.9); + std::nth_element(Timings.begin(), Timings.begin() + Pct90Off, Timings.end()); + R.Pct90 = Timings[Pct90Off]; + auto Pct99Off = std::floor(Timings.size() * 0.99); + std::nth_element(Timings.begin(), Timings.begin() + Pct90Off, Timings.end()); + R.Pct99 = Timings[Pct99Off]; + R.Count = Timings.size(); + return R; +} + +} // namespace + +template <class F> +void LatencyAccountant::exportStats(const XRayFileHeader &Header, F Fn) const { + using TupleType = std::tuple<int32_t, uint64_t, ResultRow>; + std::vector<TupleType> Results; + Results.reserve(FunctionLatencies.size()); + for (auto FT : FunctionLatencies) { + const auto &FuncId = FT.first; + auto &Timings = FT.second; + Results.emplace_back(FuncId, Timings.size(), getStats(Timings)); + auto &Row = std::get<2>(Results.back()); + if (Header.CycleFrequency) { + double CycleFrequency = Header.CycleFrequency; + Row.Min /= CycleFrequency; + Row.Median /= CycleFrequency; + Row.Pct90 /= CycleFrequency; + Row.Pct99 /= CycleFrequency; + Row.Max /= CycleFrequency; + Row.Sum /= CycleFrequency; + } + + Row.Function = FuncIdHelper.SymbolOrNumber(FuncId); + Row.DebugInfo = FuncIdHelper.FileLineAndColumn(FuncId); + } + + // Sort the data according to user-provided flags. 
+ switch (AccountSortOutput) { + case SortField::FUNCID: + std::sort(Results.begin(), Results.end(), + [](const TupleType &L, const TupleType &R) { + if (AccountSortOrder == SortDirection::ASCENDING) + return std::get<0>(L) < std::get<0>(R); + if (AccountSortOrder == SortDirection::DESCENDING) + return std::get<0>(L) > std::get<0>(R); + llvm_unreachable("Unknown sort direction"); + }); + break; + case SortField::COUNT: + std::sort(Results.begin(), Results.end(), + [](const TupleType &L, const TupleType &R) { + if (AccountSortOrder == SortDirection::ASCENDING) + return std::get<1>(L) < std::get<1>(R); + if (AccountSortOrder == SortDirection::DESCENDING) + return std::get<1>(L) > std::get<1>(R); + llvm_unreachable("Unknown sort direction"); + }); + break; + default: + // Here we need to look into the ResultRow for the rest of the data that + // we want to sort by. + std::sort(Results.begin(), Results.end(), + [&](const TupleType &L, const TupleType &R) { + auto &LR = std::get<2>(L); + auto &RR = std::get<2>(R); + switch (AccountSortOutput) { + case SortField::COUNT: + if (AccountSortOrder == SortDirection::ASCENDING) + return LR.Count < RR.Count; + if (AccountSortOrder == SortDirection::DESCENDING) + return LR.Count > RR.Count; + llvm_unreachable("Unknown sort direction"); + case SortField::MIN: + if (AccountSortOrder == SortDirection::ASCENDING) + return LR.Min < RR.Min; + if (AccountSortOrder == SortDirection::DESCENDING) + return LR.Min > RR.Min; + llvm_unreachable("Unknown sort direction"); + case SortField::MED: + if (AccountSortOrder == SortDirection::ASCENDING) + return LR.Median < RR.Median; + if (AccountSortOrder == SortDirection::DESCENDING) + return LR.Median > RR.Median; + llvm_unreachable("Unknown sort direction"); + case SortField::PCT90: + if (AccountSortOrder == SortDirection::ASCENDING) + return LR.Pct90 < RR.Pct90; + if (AccountSortOrder == SortDirection::DESCENDING) + return LR.Pct90 > RR.Pct90; + llvm_unreachable("Unknown sort direction"); + case SortField::PCT99: + if (AccountSortOrder == SortDirection::ASCENDING) + return LR.Pct99 < RR.Pct99; + if (AccountSortOrder == SortDirection::DESCENDING) + return LR.Pct99 > RR.Pct99; + llvm_unreachable("Unknown sort direction"); + case SortField::MAX: + if (AccountSortOrder == SortDirection::ASCENDING) + return LR.Max < RR.Max; + if (AccountSortOrder == SortDirection::DESCENDING) + return LR.Max > RR.Max; + llvm_unreachable("Unknown sort direction"); + case SortField::SUM: + if (AccountSortOrder == SortDirection::ASCENDING) + return LR.Sum < RR.Sum; + if (AccountSortOrder == SortDirection::DESCENDING) + return LR.Sum > RR.Sum; + llvm_unreachable("Unknown sort direction"); + default: + llvm_unreachable("Unsupported sort order"); + } + }); + break; + } + + if (AccountTop > 0) + Results.erase(Results.begin() + AccountTop.getValue(), Results.end()); + + for (const auto &R : Results) + Fn(std::get<0>(R), std::get<1>(R), std::get<2>(R)); +} + +void LatencyAccountant::exportStatsAsText(raw_ostream &OS, + const XRayFileHeader &Header) const { + OS << "Functions with latencies: " << FunctionLatencies.size() << "\n"; + + // We spend some effort to make the text output more readable, so we do the + // following formatting decisions for each of the fields: + // + // - funcid: 32-bit, but we can determine the largest number and be + // between + // a minimum of 5 characters, up to 9 characters, right aligned. 
+ // - count: 64-bit, but we can determine the largest number and be + // between + // a minimum of 5 characters, up to 9 characters, right aligned. + // - min, median, 90pct, 99pct, max: double precision, but we want to keep + // the values in seconds, with microsecond precision (0.000'001), so we + // have at most 6 significant digits, with the whole number part to be + // at + // least 1 character. For readability we'll right-align, with full 9 + // characters each. + // - debug info, function name: we format this as a concatenation of the + // debug info and the function name. + // + static constexpr char StatsHeaderFormat[] = + "{0,+9} {1,+10} [{2,+9}, {3,+9}, {4,+9}, {5,+9}, {6,+9}] {7,+9}"; + static constexpr char StatsFormat[] = + R"({0,+9} {1,+10} [{2,+9:f6}, {3,+9:f6}, {4,+9:f6}, {5,+9:f6}, {6,+9:f6}] {7,+9:f6})"; + OS << llvm::formatv(StatsHeaderFormat, "funcid", "count", "min", "med", "90p", + "99p", "max", "sum") + << llvm::formatv(" {0,-12}\n", "function"); + exportStats(Header, [&](int32_t FuncId, size_t Count, const ResultRow &Row) { + OS << llvm::formatv(StatsFormat, FuncId, Count, Row.Min, Row.Median, + Row.Pct90, Row.Pct99, Row.Max, Row.Sum) + << " " << Row.DebugInfo << ": " << Row.Function << "\n"; + }); +} + +void LatencyAccountant::exportStatsAsCSV(raw_ostream &OS, + const XRayFileHeader &Header) const { + OS << "funcid,count,min,median,90%ile,99%ile,max,sum,debug,function\n"; + exportStats(Header, [&](int32_t FuncId, size_t Count, const ResultRow &Row) { + OS << FuncId << ',' << Count << ',' << Row.Min << ',' << Row.Median << ',' + << Row.Pct90 << ',' << Row.Pct99 << ',' << Row.Max << "," << Row.Sum + << ",\"" << Row.DebugInfo << "\",\"" << Row.Function << "\"\n"; + }); +} + +using namespace llvm::xray; + +static CommandRegistration Unused(&Account, []() -> Error { + int Fd; + auto EC = sys::fs::openFileForRead(AccountInput, Fd); + if (EC) + return make_error<StringError>( + Twine("Cannot open file '") + AccountInput + "'", EC); + + Error Err = Error::success(); + xray::InstrumentationMapExtractor Extractor(AccountInstrMap, InstrMapFormat, + Err); + if (auto E = handleErrors( + std::move(Err), [&](std::unique_ptr<StringError> SE) -> Error { + if (SE->convertToErrorCode() == std::errc::no_such_file_or_directory) + return Error::success(); + return Error(std::move(SE)); + })) + return E; + + raw_fd_ostream OS(AccountOutput, EC, sys::fs::OpenFlags::F_Text); + if (EC) + return make_error<StringError>( + Twine("Cannot open file '") + AccountOutput + "' for writing.", EC); + + const auto &FunctionAddresses = Extractor.getFunctionAddresses(); + symbolize::LLVMSymbolizer::Options Opts( + symbolize::FunctionNameKind::LinkageName, true, true, false, ""); + symbolize::LLVMSymbolizer Symbolizer(Opts); + llvm::xray::FuncIdConversionHelper FuncIdHelper(AccountInstrMap, Symbolizer, + FunctionAddresses); + xray::LatencyAccountant FCA(FuncIdHelper, AccountDeduceSiblingCalls); + if (auto TraceOrErr = loadTraceFile(AccountInput)) { + auto &T = *TraceOrErr; + for (const auto &Record : T) { + if (FCA.accountRecord(Record)) + continue; + for (const auto &ThreadStack : FCA.getPerThreadFunctionStack()) { + errs() << "Thread ID: " << ThreadStack.first << "\n"; + auto Level = ThreadStack.second.size(); + for (const auto &Entry : llvm::reverse(ThreadStack.second)) + errs() << "#" << Level-- << "\t" + << FuncIdHelper.SymbolOrNumber(Entry.first) << '\n'; + } + if (!AccountKeepGoing) + return make_error<StringError>( + Twine("Failed accounting function calls in file '") + AccountInput + + "'.", + 
std::make_error_code(std::errc::executable_format_error)); + } + switch (AccountOutputFormat) { + case AccountOutputFormats::TEXT: + FCA.exportStatsAsText(OS, T.getFileHeader()); + break; + case AccountOutputFormats::CSV: + FCA.exportStatsAsCSV(OS, T.getFileHeader()); + break; + } + } else { + return joinErrors( + make_error<StringError>( + Twine("Failed loading input file '") + AccountInput + "'", + std::make_error_code(std::errc::executable_format_error)), + TraceOrErr.takeError()); + } + + return Error::success(); +}); diff --git a/tools/llvm-xray/xray-account.h b/tools/llvm-xray/xray-account.h new file mode 100644 index 000000000000..cc9ba897e537 --- /dev/null +++ b/tools/llvm-xray/xray-account.h @@ -0,0 +1,109 @@ +//===- xray-account.h - XRay Function Call Accounting ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interface for performing some basic function call +// accounting from an XRay trace. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_LLVM_XRAY_XRAY_ACCOUNT_H +#define LLVM_TOOLS_LLVM_XRAY_XRAY_ACCOUNT_H + +#include <map> +#include <utility> +#include <vector> + +#include "func-id-helper.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/XRay/XRayRecord.h" + +namespace llvm { +namespace xray { + +class LatencyAccountant { +public: + typedef std::map<int32_t, std::vector<uint64_t>> FunctionLatencyMap; + typedef std::map<llvm::sys::ProcessInfo::ProcessId, + std::pair<uint64_t, uint64_t>> + PerThreadMinMaxTSCMap; + typedef std::map<uint8_t, std::pair<uint64_t, uint64_t>> PerCPUMinMaxTSCMap; + typedef std::vector<std::pair<int32_t, uint64_t>> FunctionStack; + typedef std::map<llvm::sys::ProcessInfo::ProcessId, FunctionStack> + PerThreadFunctionStackMap; + +private: + PerThreadFunctionStackMap PerThreadFunctionStack; + FunctionLatencyMap FunctionLatencies; + PerThreadMinMaxTSCMap PerThreadMinMaxTSC; + PerCPUMinMaxTSCMap PerCPUMinMaxTSC; + FuncIdConversionHelper &FuncIdHelper; + + bool DeduceSiblingCalls = false; + uint64_t CurrentMaxTSC = 0; + + void recordLatency(int32_t FuncId, uint64_t Latency) { + FunctionLatencies[FuncId].push_back(Latency); + } + +public: + explicit LatencyAccountant(FuncIdConversionHelper &FuncIdHelper, + bool DeduceSiblingCalls) + : FuncIdHelper(FuncIdHelper), DeduceSiblingCalls(DeduceSiblingCalls) {} + + const FunctionLatencyMap &getFunctionLatencies() const { + return FunctionLatencies; + } + + const PerThreadMinMaxTSCMap &getPerThreadMinMaxTSC() const { + return PerThreadMinMaxTSC; + } + + const PerCPUMinMaxTSCMap &getPerCPUMinMaxTSC() const { + return PerCPUMinMaxTSC; + } + + /// Returns false in case we fail to account the provided record. This happens + /// in the following cases: + /// + /// - An exit record does not match any entry records for the same function. + /// If we've been set to deduce sibling calls, we try walking up the stack + /// and recording times for the higher level functions. + /// - A record has a TSC that's before the latest TSC that has been + /// recorded. We still record the TSC for the min-max. 
+ /// + bool accountRecord(const XRayRecord &Record); + + const FunctionStack * + getThreadFunctionStack(llvm::sys::ProcessInfo::ProcessId TId) const { + auto I = PerThreadFunctionStack.find(TId); + if (I == PerThreadFunctionStack.end()) + return nullptr; + return &I->second; + } + + const PerThreadFunctionStackMap &getPerThreadFunctionStack() const { + return PerThreadFunctionStack; + } + + // Output Functions + // ================ + + void exportStatsAsText(raw_ostream &OS, const XRayFileHeader &Header) const; + void exportStatsAsCSV(raw_ostream &OS, const XRayFileHeader &Header) const; + +private: + // Internal helper to implement common parts of the exportStatsAs... + // functions. + template <class F> void exportStats(const XRayFileHeader &Header, F fn) const; +}; + +} // namespace xray +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_XRAY_XRAY_ACCOUNT_H diff --git a/tools/llvm-xray/xray-converter.cc b/tools/llvm-xray/xray-converter.cc new file mode 100644 index 000000000000..31275e2902f2 --- /dev/null +++ b/tools/llvm-xray/xray-converter.cc @@ -0,0 +1,202 @@ +//===- xray-converter.cc - XRay Trace Conversion --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implements the trace conversion functions. +// +//===----------------------------------------------------------------------===// +#include "xray-converter.h" + +#include "xray-extract.h" +#include "xray-registry.h" +#include "llvm/DebugInfo/Symbolize/Symbolize.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/XRay/Trace.h" +#include "llvm/XRay/YAMLXRayRecord.h" + +using namespace llvm; +using namespace xray; + +// llvm-xray convert +// ---------------------------------------------------------------------------- +static cl::SubCommand Convert("convert", "Trace Format Conversion"); +static cl::opt<std::string> ConvertInput(cl::Positional, + cl::desc("<xray log file>"), + cl::Required, cl::sub(Convert)); +enum class ConvertFormats { BINARY, YAML }; +static cl::opt<ConvertFormats> ConvertOutputFormat( + "output-format", cl::desc("output format"), + cl::values(clEnumValN(ConvertFormats::BINARY, "raw", "output in binary"), + clEnumValN(ConvertFormats::YAML, "yaml", "output in yaml")), + cl::sub(Convert)); +static cl::alias ConvertOutputFormat2("f", cl::aliasopt(ConvertOutputFormat), + cl::desc("Alias for -output-format"), + cl::sub(Convert)); +static cl::opt<std::string> + ConvertOutput("output", cl::value_desc("output file"), cl::init("-"), + cl::desc("output file; use '-' for stdout"), + cl::sub(Convert)); +static cl::alias ConvertOutput2("o", cl::aliasopt(ConvertOutput), + cl::desc("Alias for -output"), + cl::sub(Convert)); + +static cl::opt<bool> + ConvertSymbolize("symbolize", + cl::desc("symbolize function ids from the input log"), + cl::init(false), cl::sub(Convert)); +static cl::alias ConvertSymbolize2("y", cl::aliasopt(ConvertSymbolize), + cl::desc("Alias for -symbolize"), + cl::sub(Convert)); + +static cl::opt<std::string> + ConvertInstrMap("instr_map", + cl::desc("binary with the instrumentation map, or " + "a separate instrumentation map"), + cl::value_desc("binary with xray_instr_map"), + cl::sub(Convert), cl::init("")); +static cl::alias ConvertInstrMap2("m", 
cl::aliasopt(ConvertInstrMap),
+                                  cl::desc("Alias for -instr_map"),
+                                  cl::sub(Convert));
+static cl::opt<bool> ConvertSortInput(
+    "sort",
+    cl::desc("determines whether to sort input log records by timestamp"),
+    cl::sub(Convert), cl::init(true));
+static cl::alias ConvertSortInput2("s", cl::aliasopt(ConvertSortInput),
+                                   cl::desc("Alias for -sort"),
+                                   cl::sub(Convert));
+static cl::opt<InstrumentationMapExtractor::InputFormats> InstrMapFormat(
+    "instr-map-format", cl::desc("format of instrumentation map"),
+    cl::values(clEnumValN(InstrumentationMapExtractor::InputFormats::ELF, "elf",
+                          "instrumentation map in an ELF header"),
+               clEnumValN(InstrumentationMapExtractor::InputFormats::YAML,
+                          "yaml", "instrumentation map in YAML")),
+    cl::sub(Convert), cl::init(InstrumentationMapExtractor::InputFormats::ELF));
+static cl::alias InstrMapFormat2("t", cl::aliasopt(InstrMapFormat),
+                                 cl::desc("Alias for -instr-map-format"),
+                                 cl::sub(Convert));
+
+using llvm::yaml::IO;
+using llvm::yaml::Output;
+
+void TraceConverter::exportAsYAML(const Trace &Records, raw_ostream &OS) {
+  YAMLXRayTrace Trace;
+  const auto &FH = Records.getFileHeader();
+  Trace.Header = {FH.Version, FH.Type, FH.ConstantTSC, FH.NonstopTSC,
+                  FH.CycleFrequency};
+  Trace.Records.reserve(Records.size());
+  for (const auto &R : Records) {
+    Trace.Records.push_back({R.RecordType, R.CPU, R.Type, R.FuncId,
+                             Symbolize ? FuncIdHelper.SymbolOrNumber(R.FuncId)
+                                       : std::to_string(R.FuncId),
+                             R.TSC, R.TId});
+  }
+  Output Out(OS);
+  Out << Trace;
+}
+
+void TraceConverter::exportAsRAWv1(const Trace &Records, raw_ostream &OS) {
+  // First write out the file header, in the correct endian-appropriate format
+  // (XRay currently assumes little endian).
+  support::endian::Writer<support::endianness::little> Writer(OS);
+  const auto &FH = Records.getFileHeader();
+  Writer.write(FH.Version);
+  Writer.write(FH.Type);
+  uint32_t Bitfield{0};
+  if (FH.ConstantTSC)
+    Bitfield |= 1uL;
+  if (FH.NonstopTSC)
+    Bitfield |= 1uL << 1;
+  Writer.write(Bitfield);
+  Writer.write(FH.CycleFrequency);
+
+  // There are 16 bytes of padding at the end of the file header.
+  static constexpr uint32_t Padding4B = 0;
+  Writer.write(Padding4B);
+  Writer.write(Padding4B);
+  Writer.write(Padding4B);
+  Writer.write(Padding4B);
+
+  // Then write out the rest of the records, still in an endian-appropriate
+  // format.
+  for (const auto &R : Records) {
+    Writer.write(R.RecordType);
+    Writer.write(R.CPU);
+    switch (R.Type) {
+    case RecordTypes::ENTER:
+      Writer.write(uint8_t{0});
+      break;
+    case RecordTypes::EXIT:
+      Writer.write(uint8_t{1});
+      break;
+    }
+    Writer.write(R.FuncId);
+    Writer.write(R.TSC);
+    Writer.write(R.TId);
+    Writer.write(Padding4B);
+    Writer.write(Padding4B);
+    Writer.write(Padding4B);
+  }
+}
+
+namespace llvm {
+namespace xray {
+
+static CommandRegistration Unused(&Convert, []() -> Error {
+  // FIXME: Support conversion to BINARY when upgrading XRay trace versions.
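+  // Sketch of the flow below (it mirrors the `account` subcommand): open the
+  // trace, extract the instrumentation map, then re-emit the records in the
+  // requested format (YAML or RAWv1).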
+ int Fd; + auto EC = sys::fs::openFileForRead(ConvertInput, Fd); + if (EC) + return make_error<StringError>( + Twine("Cannot open file '") + ConvertInput + "'", EC); + + Error Err = Error::success(); + xray::InstrumentationMapExtractor Extractor(ConvertInstrMap, InstrMapFormat, + Err); + handleAllErrors(std::move(Err), + [&](const ErrorInfoBase &E) { E.log(errs()); }); + + const auto &FunctionAddresses = Extractor.getFunctionAddresses(); + symbolize::LLVMSymbolizer::Options Opts( + symbolize::FunctionNameKind::LinkageName, true, true, false, ""); + symbolize::LLVMSymbolizer Symbolizer(Opts); + llvm::xray::FuncIdConversionHelper FuncIdHelper(ConvertInstrMap, Symbolizer, + FunctionAddresses); + llvm::xray::TraceConverter TC(FuncIdHelper, ConvertSymbolize); + raw_fd_ostream OS(ConvertOutput, EC, + ConvertOutputFormat == ConvertFormats::BINARY + ? sys::fs::OpenFlags::F_None + : sys::fs::OpenFlags::F_Text); + if (EC) + return make_error<StringError>( + Twine("Cannot open file '") + ConvertOutput + "' for writing.", EC); + + if (auto TraceOrErr = loadTraceFile(ConvertInput, ConvertSortInput)) { + auto &T = *TraceOrErr; + switch (ConvertOutputFormat) { + case ConvertFormats::YAML: + TC.exportAsYAML(T, OS); + break; + case ConvertFormats::BINARY: + TC.exportAsRAWv1(T, OS); + break; + } + } else { + return joinErrors( + make_error<StringError>( + Twine("Failed loading input file '") + ConvertInput + "'.", + std::make_error_code(std::errc::executable_format_error)), + TraceOrErr.takeError()); + } + return Error::success(); +}); + +} // namespace xray +} // namespace llvm diff --git a/tools/llvm-xray/xray-converter.h b/tools/llvm-xray/xray-converter.h new file mode 100644 index 000000000000..fa0d5e132f14 --- /dev/null +++ b/tools/llvm-xray/xray-converter.h @@ -0,0 +1,39 @@ +//===- xray-converter.h - XRay Trace Conversion ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Defines the TraceConverter class for turning binary traces into +// human-readable text and vice versa. 
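+//
+// Illustrative use, assuming a populated FuncIdConversionHelper and a loaded
+// xray::Trace named `Trace`:
+//
+//   TraceConverter TC(FuncIdHelper, /*Symbolize=*/true);
+//   TC.exportAsYAML(Trace, llvm::outs());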
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_LLVM_XRAY_XRAY_CONVERTER_H +#define LLVM_TOOLS_LLVM_XRAY_XRAY_CONVERTER_H + +#include "func-id-helper.h" +#include "llvm/XRay/XRayRecord.h" +#include "llvm/XRay/Trace.h" + +namespace llvm { +namespace xray { + +class TraceConverter { + FuncIdConversionHelper &FuncIdHelper; + bool Symbolize; + +public: + TraceConverter(FuncIdConversionHelper &FuncIdHelper, bool Symbolize = false) + : FuncIdHelper(FuncIdHelper), Symbolize(Symbolize) {} + + void exportAsYAML(const Trace &Records, raw_ostream &OS); + void exportAsRAWv1(const Trace &Records, raw_ostream &OS); +}; + +} // namespace xray +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_XRAY_XRAY_CONVERTER_H diff --git a/tools/llvm-xray/xray-extract.cc b/tools/llvm-xray/xray-extract.cc index e51b64c8ad6e..49ecd7421137 100644 --- a/tools/llvm-xray/xray-extract.cc +++ b/tools/llvm-xray/xray-extract.cc @@ -162,8 +162,7 @@ llvm::Error LoadBinaryInstrELF( "'.", std::make_error_code(std::errc::executable_format_error)); } - auto AlwaysInstrument = Extractor.getU8(&OffsetPtr); - Entry.AlwaysInstrument = AlwaysInstrument != 0; + Entry.AlwaysInstrument = Extractor.getU8(&OffsetPtr) != 0; // We replicate the function id generation scheme implemented in the runtime // here. Ideally we should be able to break it out, or output this map from @@ -185,30 +184,82 @@ llvm::Error LoadBinaryInstrELF( return llvm::Error::success(); } +Error LoadYAMLInstrMap( + StringRef Filename, std::deque<SledEntry> &Sleds, + InstrumentationMapExtractor::FunctionAddressMap &InstrMap, + InstrumentationMapExtractor::FunctionAddressReverseMap &FunctionIds) { + int Fd; + if (auto EC = sys::fs::openFileForRead(Filename, Fd)) + return make_error<StringError>( + Twine("Failed opening file '") + Filename + "' for reading.", EC); + + uint64_t FileSize; + if (auto EC = sys::fs::file_size(Filename, FileSize)) + return make_error<StringError>( + Twine("Failed getting size of file '") + Filename + "'.", EC); + + std::error_code EC; + sys::fs::mapped_file_region MappedFile( + Fd, sys::fs::mapped_file_region::mapmode::readonly, FileSize, 0, EC); + if (EC) + return make_error<StringError>( + Twine("Failed memory-mapping file '") + Filename + "'.", EC); + + std::vector<YAMLXRaySledEntry> YAMLSleds; + Input In(StringRef(MappedFile.data(), MappedFile.size())); + In >> YAMLSleds; + if (In.error()) + return make_error<StringError>( + Twine("Failed loading YAML document from '") + Filename + "'.", + In.error()); + + for (const auto &Y : YAMLSleds) { + InstrMap[Y.FuncId] = Y.Function; + FunctionIds[Y.Function] = Y.FuncId; + Sleds.push_back( + SledEntry{Y.Address, Y.Function, Y.Kind, Y.AlwaysInstrument}); + } + return Error::success(); +} + } // namespace InstrumentationMapExtractor::InstrumentationMapExtractor(std::string Filename, InputFormats Format, Error &EC) { ErrorAsOutParameter ErrAsOutputParam(&EC); + if (Filename.empty()) { + EC = Error::success(); + return; + } switch (Format) { case InputFormats::ELF: { EC = handleErrors( LoadBinaryInstrELF(Filename, Sleds, FunctionAddresses, FunctionIds), - [](std::unique_ptr<ErrorInfoBase> E) { + [&](std::unique_ptr<ErrorInfoBase> E) { return joinErrors( make_error<StringError>( Twine("Cannot extract instrumentation map from '") + - ExtractInput + "'.", + Filename + "'.", std::make_error_code(std::errc::executable_format_error)), std::move(E)); }); break; } - default: - llvm_unreachable("Input format type not supported yet."); + case InputFormats::YAML: 
{ + EC = handleErrors( + LoadYAMLInstrMap(Filename, Sleds, FunctionAddresses, FunctionIds), + [&](std::unique_ptr<ErrorInfoBase> E) { + return joinErrors( + make_error<StringError>( + Twine("Cannot load YAML instrumentation map from '") + + Filename + "'.", + std::make_error_code(std::errc::executable_format_error)), + std::move(E)); + }); break; } + } } void InstrumentationMapExtractor::exportAsYAML(raw_ostream &OS) { diff --git a/tools/llvm-xray/xray-record-yaml.h b/tools/llvm-xray/xray-record-yaml.h new file mode 100644 index 000000000000..abce8ff60a94 --- /dev/null +++ b/tools/llvm-xray/xray-record-yaml.h @@ -0,0 +1,102 @@ +//===- xray-record-yaml.h - XRay Record YAML Support Definitions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Types and traits specialisations for YAML I/O of XRay log entries. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_LLVM_XRAY_XRAY_RECORD_YAML_H +#define LLVM_TOOLS_LLVM_XRAY_XRAY_RECORD_YAML_H + +#include <type_traits> + +#include "xray-record.h" +#include "llvm/Support/YAMLTraits.h" + +namespace llvm { +namespace xray { + +struct YAMLXRayFileHeader { + uint16_t Version; + uint16_t Type; + bool ConstantTSC; + bool NonstopTSC; + uint64_t CycleFrequency; +}; + +struct YAMLXRayRecord { + uint16_t RecordType; + uint8_t CPU; + RecordTypes Type; + int32_t FuncId; + std::string Function; + uint64_t TSC; + uint32_t TId; +}; + +struct YAMLXRayTrace { + YAMLXRayFileHeader Header; + std::vector<YAMLXRayRecord> Records; +}; + +using XRayRecordStorage = + std::aligned_storage<sizeof(XRayRecord), alignof(XRayRecord)>::type; + +} // namespace xray + +namespace yaml { + +// YAML Traits +// ----------- +template <> struct ScalarEnumerationTraits<xray::RecordTypes> { + static void enumeration(IO &IO, xray::RecordTypes &Type) { + IO.enumCase(Type, "function-enter", xray::RecordTypes::ENTER); + IO.enumCase(Type, "function-exit", xray::RecordTypes::EXIT); + } +}; + +template <> struct MappingTraits<xray::YAMLXRayFileHeader> { + static void mapping(IO &IO, xray::YAMLXRayFileHeader &Header) { + IO.mapRequired("version", Header.Version); + IO.mapRequired("type", Header.Type); + IO.mapRequired("constant-tsc", Header.ConstantTSC); + IO.mapRequired("nonstop-tsc", Header.NonstopTSC); + IO.mapRequired("cycle-frequency", Header.CycleFrequency); + } +}; + +template <> struct MappingTraits<xray::YAMLXRayRecord> { + static void mapping(IO &IO, xray::YAMLXRayRecord &Record) { + // FIXME: Make this type actually be descriptive + IO.mapRequired("type", Record.RecordType); + IO.mapRequired("func-id", Record.FuncId); + IO.mapOptional("function", Record.Function); + IO.mapRequired("cpu", Record.CPU); + IO.mapRequired("thread", Record.TId); + IO.mapRequired("kind", Record.Type); + IO.mapRequired("tsc", Record.TSC); + } + + static constexpr bool flow = true; +}; + +template <> struct MappingTraits<xray::YAMLXRayTrace> { + static void mapping(IO &IO, xray::YAMLXRayTrace &Trace) { + // A trace file contains two parts, the header and the list of all the + // trace records. 
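+    // An illustrative sketch of the resulting document (values invented):
+    //
+    //   header:
+    //     version:         1
+    //     type:            0
+    //     constant-tsc:    true
+    //     nonstop-tsc:     true
+    //     cycle-frequency: 2601000000
+    //   records:
+    //     - { type: 0, func-id: 1, function: main, cpu: 0, thread: 100,
+    //         kind: function-enter, tsc: 10001 }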
+ IO.mapRequired("header", Trace.Header); + IO.mapRequired("records", Trace.Records); + } +}; + +} // namespace yaml +} // namespace llvm + +LLVM_YAML_IS_SEQUENCE_VECTOR(xray::YAMLXRayRecord) + +#endif // LLVM_TOOLS_LLVM_XRAY_XRAY_RECORD_YAML_H diff --git a/tools/obj2yaml/dwarf2yaml.cpp b/tools/obj2yaml/dwarf2yaml.cpp index cf8b3e5b9273..cbf34ed5388a 100644 --- a/tools/obj2yaml/dwarf2yaml.cpp +++ b/tools/obj2yaml/dwarf2yaml.cpp @@ -127,7 +127,7 @@ void dumpDebugInfo(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) { NewValue.Value = 0xDEADBEEFDEADBEEF; DWARFDie DIEWrapper(CU.get(), &DIE); auto FormValue = DIEWrapper.getAttributeValue(AttrSpec.Attr); - if(!FormValue) + if (!FormValue) return; auto Form = FormValue.getValue().getForm(); bool indirect = false; @@ -211,11 +211,137 @@ void dumpDebugInfo(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) { } } +bool dumpFileEntry(DataExtractor &Data, uint32_t &Offset, + DWARFYAML::File &File) { + File.Name = Data.getCStr(&Offset); + if (File.Name.empty()) + return false; + File.DirIdx = Data.getULEB128(&Offset); + File.ModTime = Data.getULEB128(&Offset); + File.Length = Data.getULEB128(&Offset); + return true; +} + +void dumpDebugLines(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) { + for (const auto &CU : DCtx.compile_units()) { + auto CUDIE = CU->getUnitDIE(); + if (!CUDIE) + continue; + if (auto StmtOffset = + CUDIE.getAttributeValueAsSectionOffset(dwarf::DW_AT_stmt_list)) { + DWARFYAML::LineTable DebugLines; + DataExtractor LineData(DCtx.getLineSection().Data, DCtx.isLittleEndian(), + CU->getAddressByteSize()); + uint32_t Offset = *StmtOffset; + uint64_t SizeOfPrologueLength = 4; + DebugLines.TotalLength = LineData.getU32(&Offset); + uint64_t LineTableLength = DebugLines.TotalLength; + if (DebugLines.TotalLength == UINT32_MAX) { + DebugLines.TotalLength64 = LineData.getU64(&Offset); + LineTableLength = DebugLines.TotalLength64; + SizeOfPrologueLength = 8; + } + DebugLines.Version = LineData.getU16(&Offset); + DebugLines.PrologueLength = + LineData.getUnsigned(&Offset, SizeOfPrologueLength); + const uint64_t EndPrologue = DebugLines.PrologueLength + Offset; + + DebugLines.MinInstLength = LineData.getU8(&Offset); + if (DebugLines.Version >= 4) + DebugLines.MaxOpsPerInst = LineData.getU8(&Offset); + DebugLines.DefaultIsStmt = LineData.getU8(&Offset); + DebugLines.LineBase = LineData.getU8(&Offset); + DebugLines.LineRange = LineData.getU8(&Offset); + DebugLines.OpcodeBase = LineData.getU8(&Offset); + + DebugLines.StandardOpcodeLengths.reserve(DebugLines.OpcodeBase - 1); + for (uint8_t i = 1; i < DebugLines.OpcodeBase; ++i) + DebugLines.StandardOpcodeLengths.push_back(LineData.getU8(&Offset)); + + while (Offset < EndPrologue) { + StringRef Dir = LineData.getCStr(&Offset); + if (!Dir.empty()) + DebugLines.IncludeDirs.push_back(Dir); + else + break; + } + + while (Offset < EndPrologue) { + DWARFYAML::File TmpFile; + if (dumpFileEntry(LineData, Offset, TmpFile)) + DebugLines.Files.push_back(TmpFile); + else + break; + } + + const uint64_t LineEnd = + LineTableLength + *StmtOffset + SizeOfPrologueLength; + while (Offset < LineEnd) { + DWARFYAML::LineTableOpcode NewOp; + NewOp.Opcode = (dwarf::LineNumberOps)LineData.getU8(&Offset); + if (NewOp.Opcode == 0) { + auto StartExt = Offset; + NewOp.ExtLen = LineData.getULEB128(&Offset); + NewOp.SubOpcode = + (dwarf::LineNumberExtendedOps)LineData.getU8(&Offset); + switch (NewOp.SubOpcode) { + case dwarf::DW_LNE_set_address: + case dwarf::DW_LNE_set_discriminator: + NewOp.Data = LineData.getAddress(&Offset); + 
break; + case dwarf::DW_LNE_define_file: + dumpFileEntry(LineData, Offset, NewOp.FileEntry); + break; + case dwarf::DW_LNE_end_sequence: + break; + default: + while (Offset < StartExt + NewOp.ExtLen) + NewOp.UnknownOpcodeData.push_back(LineData.getU8(&Offset)); + } + } else if (NewOp.Opcode < DebugLines.OpcodeBase) { + switch (NewOp.Opcode) { + case dwarf::DW_LNS_copy: + case dwarf::DW_LNS_negate_stmt: + case dwarf::DW_LNS_set_basic_block: + case dwarf::DW_LNS_const_add_pc: + case dwarf::DW_LNS_set_prologue_end: + case dwarf::DW_LNS_set_epilogue_begin: + break; + + case dwarf::DW_LNS_advance_pc: + case dwarf::DW_LNS_set_file: + case dwarf::DW_LNS_set_column: + case dwarf::DW_LNS_set_isa: + NewOp.Data = LineData.getULEB128(&Offset); + break; + + case dwarf::DW_LNS_advance_line: + NewOp.SData = LineData.getSLEB128(&Offset); + break; + + case dwarf::DW_LNS_fixed_advance_pc: + NewOp.Data = LineData.getU16(&Offset); + break; + + default: + for (uint8_t i = 0; + i < DebugLines.StandardOpcodeLengths[NewOp.Opcode - 1]; ++i) + NewOp.StandardOpcodeData.push_back(LineData.getULEB128(&Offset)); + } + } + DebugLines.Opcodes.push_back(NewOp); + } + Y.DebugLines.push_back(DebugLines); + } + } +} + std::error_code dwarf2yaml(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) { dumpDebugAbbrev(DCtx, Y); dumpDebugStrings(DCtx, Y); dumpDebugARanges(DCtx, Y); dumpDebugPubSections(DCtx, Y); dumpDebugInfo(DCtx, Y); + dumpDebugLines(DCtx, Y); return obj2yaml_error::success; } diff --git a/tools/opt/NewPMDriver.cpp b/tools/opt/NewPMDriver.cpp index acdf2639b3c7..df467da690e7 100644 --- a/tools/opt/NewPMDriver.cpp +++ b/tools/opt/NewPMDriver.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CGSCCPassManager.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Bitcode/BitcodeWriterPass.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRPrintingPasses.h" @@ -30,6 +29,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" using namespace llvm; using namespace opt_tool; diff --git a/tools/yaml2obj/yaml2dwarf.cpp b/tools/yaml2obj/yaml2dwarf.cpp index 8ba1190c56a7..3ceb7772b969 100644 --- a/tools/yaml2obj/yaml2dwarf.cpp +++ b/tools/yaml2obj/yaml2dwarf.cpp @@ -233,3 +233,98 @@ void yaml2debug_info(raw_ostream &OS, const DWARFYAML::Data &DI) { } } } + +void yaml2FileEntry(raw_ostream &OS, const DWARFYAML::File &File) { + OS.write(File.Name.data(), File.Name.size()); + OS.write('\0'); + encodeULEB128(File.DirIdx, OS); + encodeULEB128(File.ModTime, OS); + encodeULEB128(File.Length, OS); +} + +void yaml2debug_line(raw_ostream &OS, const DWARFYAML::Data &DI) { + for (const auto LineTable : DI.DebugLines) { + writeInteger((uint32_t)LineTable.TotalLength, OS, DI.IsLittleEndian); + uint64_t SizeOfPrologueLength = 4; + if (LineTable.TotalLength == UINT32_MAX) { + writeInteger((uint64_t)LineTable.TotalLength64, OS, DI.IsLittleEndian); + SizeOfPrologueLength = 8; + } + writeInteger((uint16_t)LineTable.Version, OS, DI.IsLittleEndian); + writeVariableSizedInteger(LineTable.PrologueLength, SizeOfPrologueLength, + OS, DI.IsLittleEndian); + writeInteger((uint8_t)LineTable.MinInstLength, OS, DI.IsLittleEndian); + if (LineTable.Version >= 4) + writeInteger((uint8_t)LineTable.MaxOpsPerInst, OS, DI.IsLittleEndian); + writeInteger((uint8_t)LineTable.DefaultIsStmt, OS, DI.IsLittleEndian); + writeInteger((uint8_t)LineTable.LineBase, OS, 
DI.IsLittleEndian); + writeInteger((uint8_t)LineTable.LineRange, OS, DI.IsLittleEndian); + writeInteger((uint8_t)LineTable.OpcodeBase, OS, DI.IsLittleEndian); + + for (auto OpcodeLength : LineTable.StandardOpcodeLengths) + writeInteger((uint8_t)OpcodeLength, OS, DI.IsLittleEndian); + + for (auto IncludeDir : LineTable.IncludeDirs) { + OS.write(IncludeDir.data(), IncludeDir.size()); + OS.write('\0'); + } + OS.write('\0'); + + for (auto File : LineTable.Files) + yaml2FileEntry(OS, File); + OS.write('\0'); + + for (auto Op : LineTable.Opcodes) { + writeInteger((uint8_t)Op.Opcode, OS, DI.IsLittleEndian); + if (Op.Opcode == 0) { + encodeULEB128(Op.ExtLen, OS); + writeInteger((uint8_t)Op.SubOpcode, OS, DI.IsLittleEndian); + switch (Op.SubOpcode) { + case dwarf::DW_LNE_set_address: + case dwarf::DW_LNE_set_discriminator: + writeVariableSizedInteger(Op.Data, DI.CompileUnits[0].AddrSize, OS, + DI.IsLittleEndian); + break; + case dwarf::DW_LNE_define_file: + yaml2FileEntry(OS, Op.FileEntry); + break; + case dwarf::DW_LNE_end_sequence: + break; + default: + for (auto OpByte : Op.UnknownOpcodeData) + writeInteger((uint8_t)OpByte, OS, DI.IsLittleEndian); + } + } else if (Op.Opcode < LineTable.OpcodeBase) { + switch (Op.Opcode) { + case dwarf::DW_LNS_copy: + case dwarf::DW_LNS_negate_stmt: + case dwarf::DW_LNS_set_basic_block: + case dwarf::DW_LNS_const_add_pc: + case dwarf::DW_LNS_set_prologue_end: + case dwarf::DW_LNS_set_epilogue_begin: + break; + + case dwarf::DW_LNS_advance_pc: + case dwarf::DW_LNS_set_file: + case dwarf::DW_LNS_set_column: + case dwarf::DW_LNS_set_isa: + encodeULEB128(Op.Data, OS); + break; + + case dwarf::DW_LNS_advance_line: + encodeSLEB128(Op.SData, OS); + break; + + case dwarf::DW_LNS_fixed_advance_pc: + writeInteger((uint16_t)Op.Data, OS, DI.IsLittleEndian); + break; + + default: + for (auto OpData : Op.StandardOpcodeData) { + encodeULEB128(OpData, OS); + } + } + } + } + } +} diff --git a/tools/yaml2obj/yaml2macho.cpp b/tools/yaml2obj/yaml2macho.cpp index a41ec55d73be..cbc4d7ff50d5 100644 --- a/tools/yaml2obj/yaml2macho.cpp +++ b/tools/yaml2obj/yaml2macho.cpp @@ -280,6 +280,8 @@ Error MachOWriter::writeSectionData(raw_ostream &OS) { yaml2pubsection(OS, Obj.DWARF.PubTypes, Obj.IsLittleEndian); } else if (0 == strncmp(&Sec.sectname[0], "__debug_info", 16)) { yaml2debug_info(OS, Obj.DWARF); + } else if (0 == strncmp(&Sec.sectname[0], "__debug_line", 16)) { + yaml2debug_line(OS, Obj.DWARF); } } else { // Fills section data with 0xDEADBEEF diff --git a/tools/yaml2obj/yaml2obj.h b/tools/yaml2obj/yaml2obj.h index 7cad4ca8675f..4a637366e1a1 100644 --- a/tools/yaml2obj/yaml2obj.h +++ b/tools/yaml2obj/yaml2obj.h @@ -46,5 +46,6 @@ void yaml2pubsection(llvm::raw_ostream &OS, const llvm::DWARFYAML::PubSection &Sect, bool IsLittleEndian); void yaml2debug_info(llvm::raw_ostream &OS, const llvm::DWARFYAML::Data &DI); +void yaml2debug_line(llvm::raw_ostream &OS, const llvm::DWARFYAML::Data &DI); #endif diff --git a/unittests/Analysis/CMakeLists.txt b/unittests/Analysis/CMakeLists.txt index 65a2ac094cff..ff4c17ee3b6b 100644 --- a/unittests/Analysis/CMakeLists.txt +++ b/unittests/Analysis/CMakeLists.txt @@ -13,7 +13,6 @@ add_llvm_unittest(AnalysisTests CFGTest.cpp CGSCCPassManagerTest.cpp LazyCallGraphTest.cpp - LoopPassManagerTest.cpp MemoryBuiltinsTest.cpp ScalarEvolutionTest.cpp TBAATest.cpp diff --git a/unittests/Analysis/LoopPassManagerTest.cpp b/unittests/Analysis/LoopPassManagerTest.cpp deleted file mode 100644 index 092e4bf91133..000000000000 --- 
a/unittests/Analysis/LoopPassManagerTest.cpp +++ /dev/null @@ -1,209 +0,0 @@ -//===- llvm/unittest/Analysis/LoopPassManagerTest.cpp - LPM tests ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/LoopPassManager.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/AsmParser/Parser.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Support/SourceMgr.h" -#include "gtest/gtest.h" - -using namespace llvm; - -namespace { - -class TestLoopAnalysis : public AnalysisInfoMixin<TestLoopAnalysis> { - friend AnalysisInfoMixin<TestLoopAnalysis>; - static AnalysisKey Key; - - int &Runs; - -public: - struct Result { - Result(int Count) : BlockCount(Count) {} - int BlockCount; - }; - - TestLoopAnalysis(int &Runs) : Runs(Runs) {} - - /// \brief Run the analysis pass over the loop and return a result. - Result run(Loop &L, LoopAnalysisManager &AM) { - ++Runs; - int Count = 0; - - for (auto I = L.block_begin(), E = L.block_end(); I != E; ++I) - ++Count; - return Result(Count); - } -}; - -AnalysisKey TestLoopAnalysis::Key; - -class TestLoopPass { - std::vector<StringRef> &VisitedLoops; - int &AnalyzedBlockCount; - bool OnlyUseCachedResults; - -public: - TestLoopPass(std::vector<StringRef> &VisitedLoops, int &AnalyzedBlockCount, - bool OnlyUseCachedResults = false) - : VisitedLoops(VisitedLoops), AnalyzedBlockCount(AnalyzedBlockCount), - OnlyUseCachedResults(OnlyUseCachedResults) {} - - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM) { - VisitedLoops.push_back(L.getName()); - - if (OnlyUseCachedResults) { - // Hack to force the use of the cached interface. - if (auto *AR = AM.getCachedResult<TestLoopAnalysis>(L)) - AnalyzedBlockCount += AR->BlockCount; - } else { - // Typical path just runs the analysis as needed. - auto &AR = AM.getResult<TestLoopAnalysis>(L); - AnalyzedBlockCount += AR.BlockCount; - } - - return PreservedAnalyses::all(); - } - - static StringRef name() { return "TestLoopPass"; } -}; - -// A test loop pass that invalidates the analysis for loops with the given name. -class TestLoopInvalidatingPass { - StringRef Name; - -public: - TestLoopInvalidatingPass(StringRef LoopName) : Name(LoopName) {} - - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM) { - return L.getName() == Name ? 
getLoopPassPreservedAnalyses() - : PreservedAnalyses::all(); - } - - static StringRef name() { return "TestLoopInvalidatingPass"; } -}; - -std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) { - SMDiagnostic Err; - return parseAssemblyString(IR, Err, C); -} - -class LoopPassManagerTest : public ::testing::Test { -protected: - LLVMContext Context; - std::unique_ptr<Module> M; - -public: - LoopPassManagerTest() - : M(parseIR(Context, "define void @f() {\n" - "entry:\n" - " br label %loop.0\n" - "loop.0:\n" - " br i1 undef, label %loop.0.0, label %end\n" - "loop.0.0:\n" - " br i1 undef, label %loop.0.0, label %loop.0.1\n" - "loop.0.1:\n" - " br i1 undef, label %loop.0.1, label %loop.0\n" - "end:\n" - " ret void\n" - "}\n" - "\n" - "define void @g() {\n" - "entry:\n" - " br label %loop.g.0\n" - "loop.g.0:\n" - " br i1 undef, label %loop.g.0, label %end\n" - "end:\n" - " ret void\n" - "}\n")) {} -}; - -#define EXPECT_N_ELEMENTS_EQ(N, EXPECTED, ACTUAL) \ - do { \ - EXPECT_EQ(N##UL, ACTUAL.size()); \ - for (int I = 0; I < N; ++I) \ - EXPECT_TRUE(EXPECTED[I] == ACTUAL[I]) << "Element " << I << " is " \ - << ACTUAL[I] << ". Expected " \ - << EXPECTED[I] << "."; \ - } while (0) - -TEST_F(LoopPassManagerTest, Basic) { - LoopAnalysisManager LAM(true); - int LoopAnalysisRuns = 0; - LAM.registerPass([&] { return TestLoopAnalysis(LoopAnalysisRuns); }); - - FunctionAnalysisManager FAM(true); - // We need DominatorTreeAnalysis for LoopAnalysis. - FAM.registerPass([&] { return DominatorTreeAnalysis(); }); - FAM.registerPass([&] { return LoopAnalysis(); }); - // We also allow loop passes to assume a set of other analyses and so need - // those. - FAM.registerPass([&] { return AAManager(); }); - FAM.registerPass([&] { return TargetLibraryAnalysis(); }); - FAM.registerPass([&] { return ScalarEvolutionAnalysis(); }); - FAM.registerPass([&] { return AssumptionAnalysis(); }); - FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); }); - LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); }); - - ModuleAnalysisManager MAM(true); - MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); }); - FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); }); - - ModulePassManager MPM(true); - FunctionPassManager FPM(true); - - // Visit all of the loops. - std::vector<StringRef> VisitedLoops1; - int AnalyzedBlockCount1 = 0; - { - LoopPassManager LPM; - LPM.addPass(TestLoopPass(VisitedLoops1, AnalyzedBlockCount1)); - - FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM))); - } - - // Only use cached analyses. - std::vector<StringRef> VisitedLoops2; - int AnalyzedBlockCount2 = 0; - { - LoopPassManager LPM; - LPM.addPass(TestLoopInvalidatingPass("loop.g.0")); - LPM.addPass(TestLoopPass(VisitedLoops2, AnalyzedBlockCount2, - /*OnlyUseCachedResults=*/true)); - - FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM))); - } - - MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); - MPM.run(*M, MAM); - - StringRef ExpectedLoops[] = {"loop.0.0", "loop.0.1", "loop.0", "loop.g.0"}; - - // Validate the counters and order of loops visited. - // loop.0 has 3 blocks whereas loop.0.0, loop.0.1, and loop.g.0 each have 1. - EXPECT_N_ELEMENTS_EQ(4, ExpectedLoops, VisitedLoops1); - EXPECT_EQ(6, AnalyzedBlockCount1); - - EXPECT_N_ELEMENTS_EQ(4, ExpectedLoops, VisitedLoops2); - // The block from loop.g.0 won't be counted, since it wasn't cached. 
- EXPECT_EQ(5, AnalyzedBlockCount2); - - // The first LPM runs the loop analysis for all four loops, the second uses - // cached results for everything. - EXPECT_EQ(4, LoopAnalysisRuns); -} -} diff --git a/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp b/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp index 4435b7942682..a4109a34097f 100644 --- a/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp +++ b/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp @@ -77,6 +77,7 @@ void TestAllForms() { const uint64_t Data8 = 0x0011223344556677ULL; const uint64_t Data8_2 = 0xAABBCCDDEEFF0011ULL; const int64_t SData = INT64_MIN; + const int64_t ICSData = INT64_MAX; // DW_FORM_implicit_const SData const uint64_t UData[] = {UINT64_MAX - 1, UINT64_MAX - 2, UINT64_MAX - 3, UINT64_MAX - 4, UINT64_MAX - 5, UINT64_MAX - 6, UINT64_MAX - 7, UINT64_MAX - 8, UINT64_MAX - 9}; @@ -181,6 +182,12 @@ void TestAllForms() { const auto Attr_DW_FORM_sdata = static_cast<dwarf::Attribute>(Attr++); CUDie.addAttribute(Attr_DW_FORM_sdata, DW_FORM_sdata, SData); + const auto Attr_DW_FORM_implicit_const = + static_cast<dwarf::Attribute>(Attr++); + if (Version >= 5) + CUDie.addAttribute(Attr_DW_FORM_implicit_const, DW_FORM_implicit_const, + ICSData); + //---------------------------------------------------------------------- // Test ULEB128 based forms //---------------------------------------------------------------------- @@ -221,7 +228,7 @@ void TestAllForms() { //---------------------------------------------------------------------- // Test address forms //---------------------------------------------------------------------- - EXPECT_EQ(DieDG.getAttributeValueAsAddress(Attr_DW_FORM_addr, 0), + EXPECT_EQ(DieDG.getAttributeValueAsAddress(Attr_DW_FORM_addr).getValueOr(0), AddrValue); //---------------------------------------------------------------------- @@ -266,18 +273,18 @@ void TestAllForms() { //---------------------------------------------------------------------- // Test data forms //---------------------------------------------------------------------- - EXPECT_EQ( - DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_data1, 0), - Data1); - EXPECT_EQ( - DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_data2, 0), - Data2); - EXPECT_EQ( - DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_data4, 0), - Data4); - EXPECT_EQ( - DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_data8, 0), - Data8); + EXPECT_EQ(DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_data1) + .getValueOr(0), + Data1); + EXPECT_EQ(DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_data2) + .getValueOr(0), + Data2); + EXPECT_EQ(DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_data4) + .getValueOr(0), + Data4); + EXPECT_EQ(DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_data8) + .getValueOr(0), + Data8); //---------------------------------------------------------------------- // Test string forms @@ -295,63 +302,71 @@ void TestAllForms() { //---------------------------------------------------------------------- // Test reference forms //---------------------------------------------------------------------- - EXPECT_EQ(DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref_addr, 0), - RefAddr); - EXPECT_EQ(DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref1, 0), + EXPECT_EQ( + DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref_addr).getValueOr(0), + RefAddr); + EXPECT_EQ(DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref1).getValueOr(0), Data1); - EXPECT_EQ(DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref2, 0), + 
EXPECT_EQ(DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref2).getValueOr(0), Data2); - EXPECT_EQ(DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref4, 0), + EXPECT_EQ(DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref4).getValueOr(0), Data4); - EXPECT_EQ(DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref8, 0), + EXPECT_EQ(DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref8).getValueOr(0), Data8); - EXPECT_EQ(DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref_sig8, 0), - Data8_2); - EXPECT_EQ(DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref_udata, 0), - UData[0]); + EXPECT_EQ( + DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref_sig8).getValueOr(0), + Data8_2); + EXPECT_EQ( + DieDG.getAttributeValueAsReference(Attr_DW_FORM_ref_udata).getValueOr(0), + UData[0]); //---------------------------------------------------------------------- // Test flag forms //---------------------------------------------------------------------- - EXPECT_EQ(DieDG.getAttributeValueAsUnsignedConstant( - Attr_DW_FORM_flag_true, 0ULL), + EXPECT_EQ(DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_flag_true) + .getValueOr(0), 1ULL); - EXPECT_EQ(DieDG.getAttributeValueAsUnsignedConstant( - Attr_DW_FORM_flag_false, 1ULL), + EXPECT_EQ(DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_flag_false) + .getValueOr(1), 0ULL); - EXPECT_EQ(DieDG.getAttributeValueAsUnsignedConstant( - Attr_DW_FORM_flag_present, 0ULL), + EXPECT_EQ(DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_flag_present) + .getValueOr(0ULL), 1ULL); - // TODO: test Attr_DW_FORM_implicit_const extraction - //---------------------------------------------------------------------- // Test SLEB128 based forms //---------------------------------------------------------------------- - EXPECT_EQ(DieDG.getAttributeValueAsSignedConstant(Attr_DW_FORM_sdata, 0), - SData); + EXPECT_EQ( + DieDG.getAttributeValueAsSignedConstant(Attr_DW_FORM_sdata).getValueOr(0), + SData); + if (Version >= 5) + EXPECT_EQ( + DieDG.getAttributeValueAsSignedConstant(Attr_DW_FORM_implicit_const) + .getValueOr(0), + ICSData); //---------------------------------------------------------------------- // Test ULEB128 based forms //---------------------------------------------------------------------- - EXPECT_EQ( - DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_udata, 0), - UData[0]); + EXPECT_EQ(DieDG.getAttributeValueAsUnsignedConstant(Attr_DW_FORM_udata) + .getValueOr(0), + UData[0]); //---------------------------------------------------------------------- // Test DWARF32/DWARF64 forms //---------------------------------------------------------------------- - EXPECT_EQ( - DieDG.getAttributeValueAsReference(Attr_DW_FORM_GNU_ref_alt, 0), - Dwarf32Values[0]); - EXPECT_EQ( - DieDG.getAttributeValueAsSectionOffset(Attr_DW_FORM_sec_offset, 0), - Dwarf32Values[1]); + EXPECT_EQ(DieDG.getAttributeValueAsReference(Attr_DW_FORM_GNU_ref_alt) + .getValueOr(0), + Dwarf32Values[0]); + EXPECT_EQ(DieDG.getAttributeValueAsSectionOffset(Attr_DW_FORM_sec_offset) + .getValueOr(0), + Dwarf32Values[1]); //---------------------------------------------------------------------- // Add an address at the end to make sure we can decode this value //---------------------------------------------------------------------- - EXPECT_EQ(DieDG.getAttributeValueAsAddress(Attr_Last, 0), AddrValue); + EXPECT_EQ(DieDG.getAttributeValueAsAddress(Attr_Last).getValueOr(0), + AddrValue); } TEST(DWARFDebugInfo, TestDWARF32Version2Addr4AllForms) { @@ -408,6 +423,24 @@ TEST(DWARFDebugInfo, 
TestDWARF32Version4Addr8AllForms) { TestAllForms<4, AddrType, RefAddrType>(); } +TEST(DWARFDebugInfo, TestDWARF32Version5Addr4AllForms) { + // Test that we can decode all forms for DWARF32, version 5, with 4 byte + // addresses. + typedef uint32_t AddrType; + // DW_FORM_ref_addr are 4 bytes in DWARF32 for version 3 and later + typedef uint32_t RefAddrType; + TestAllForms<5, AddrType, RefAddrType>(); +} + +TEST(DWARFDebugInfo, TestDWARF32Version5Addr8AllForms) { + // Test that we can decode all forms for DWARF32, version 5, with 8 byte + // addresses. + typedef uint64_t AddrType; + // DW_FORM_ref_addr are 4 bytes in DWARF32 for version 3 and later + typedef uint32_t RefAddrType; + TestAllForms<5, AddrType, RefAddrType>(); +} + template <uint16_t Version, class AddrType> void TestChildren() { // Test that we can decode DW_FORM_ref_addr values correctly in DWARF 2 with // 4 byte addresses. DW_FORM_ref_addr values should be 4 bytes when using @@ -639,65 +672,69 @@ template <uint16_t Version, class AddrType> void TestReferences() { auto CU1TypeDieDG = Unit1DieDG.getFirstChild(); EXPECT_TRUE(CU1TypeDieDG.isValid()); EXPECT_EQ(CU1TypeDieDG.getTag(), DW_TAG_base_type); - EXPECT_EQ( - CU1TypeDieDG.getAttributeValueAsUnsignedConstant(DW_AT_encoding, 0), - DW_ATE_signed); + EXPECT_EQ(CU1TypeDieDG.getAttributeValueAsUnsignedConstant(DW_AT_encoding) + .getValueOr(0), + DW_ATE_signed); // Verify the first child of the compile unit 2 DIE is our float base type. auto CU2TypeDieDG = Unit2DieDG.getFirstChild(); EXPECT_TRUE(CU2TypeDieDG.isValid()); EXPECT_EQ(CU2TypeDieDG.getTag(), DW_TAG_base_type); - EXPECT_EQ( - CU2TypeDieDG.getAttributeValueAsUnsignedConstant(DW_AT_encoding, 0), - DW_ATE_float); + EXPECT_EQ(CU2TypeDieDG.getAttributeValueAsUnsignedConstant(DW_AT_encoding) + .getValueOr(0), + DW_ATE_float); // Verify the sibling of the base type DIE is our Ref1 DIE and that its // DW_AT_type points to our base type DIE. auto CU1Ref1DieDG = CU1TypeDieDG.getSibling(); EXPECT_TRUE(CU1Ref1DieDG.isValid()); EXPECT_EQ(CU1Ref1DieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU1Ref1DieDG.getAttributeValueAsReference(DW_AT_type, -1ULL), - CU1TypeDieDG.getOffset()); + EXPECT_EQ( + CU1Ref1DieDG.getAttributeValueAsReference(DW_AT_type).getValueOr(-1ULL), + CU1TypeDieDG.getOffset()); // Verify the sibling is our Ref2 DIE and that its DW_AT_type points to our // base type DIE in CU1. auto CU1Ref2DieDG = CU1Ref1DieDG.getSibling(); EXPECT_TRUE(CU1Ref2DieDG.isValid()); EXPECT_EQ(CU1Ref2DieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU1Ref2DieDG.getAttributeValueAsReference(DW_AT_type, -1ULL), - CU1TypeDieDG.getOffset()); + EXPECT_EQ( + CU1Ref2DieDG.getAttributeValueAsReference(DW_AT_type).getValueOr(-1ULL), + CU1TypeDieDG.getOffset()); // Verify the sibling is our Ref4 DIE and that its DW_AT_type points to our // base type DIE in CU1. auto CU1Ref4DieDG = CU1Ref2DieDG.getSibling(); EXPECT_TRUE(CU1Ref4DieDG.isValid()); EXPECT_EQ(CU1Ref4DieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU1Ref4DieDG.getAttributeValueAsReference(DW_AT_type, -1ULL), - CU1TypeDieDG.getOffset()); + EXPECT_EQ( + CU1Ref4DieDG.getAttributeValueAsReference(DW_AT_type).getValueOr(-1ULL), + CU1TypeDieDG.getOffset()); // Verify the sibling is our Ref8 DIE and that its DW_AT_type points to our // base type DIE in CU1. 
auto CU1Ref8DieDG = CU1Ref4DieDG.getSibling(); EXPECT_TRUE(CU1Ref8DieDG.isValid()); EXPECT_EQ(CU1Ref8DieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU1Ref8DieDG.getAttributeValueAsReference(DW_AT_type, -1ULL), - CU1TypeDieDG.getOffset()); + EXPECT_EQ( + CU1Ref8DieDG.getAttributeValueAsReference(DW_AT_type).getValueOr(-1ULL), + CU1TypeDieDG.getOffset()); // Verify the sibling is our RefAddr DIE and that its DW_AT_type points to our // base type DIE in CU1. auto CU1RefAddrDieDG = CU1Ref8DieDG.getSibling(); EXPECT_TRUE(CU1RefAddrDieDG.isValid()); EXPECT_EQ(CU1RefAddrDieDG.getTag(), DW_TAG_variable); - EXPECT_EQ( - CU1RefAddrDieDG.getAttributeValueAsReference(DW_AT_type, -1ULL), - CU1TypeDieDG.getOffset()); + EXPECT_EQ(CU1RefAddrDieDG.getAttributeValueAsReference(DW_AT_type) + .getValueOr(-1ULL), + CU1TypeDieDG.getOffset()); // Verify the sibling of the Ref4 DIE is our RefAddr DIE and that its // DW_AT_type points to our base type DIE. auto CU1ToCU2RefAddrDieDG = CU1RefAddrDieDG.getSibling(); EXPECT_TRUE(CU1ToCU2RefAddrDieDG.isValid()); EXPECT_EQ(CU1ToCU2RefAddrDieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU1ToCU2RefAddrDieDG.getAttributeValueAsReference(DW_AT_type, - -1ULL), + EXPECT_EQ(CU1ToCU2RefAddrDieDG.getAttributeValueAsReference(DW_AT_type) + .getValueOr(-1ULL), CU2TypeDieDG.getOffset()); // Verify the sibling of the base type DIE is our Ref1 DIE and that its @@ -705,48 +742,52 @@ template <uint16_t Version, class AddrType> void TestReferences() { auto CU2Ref1DieDG = CU2TypeDieDG.getSibling(); EXPECT_TRUE(CU2Ref1DieDG.isValid()); EXPECT_EQ(CU2Ref1DieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU2Ref1DieDG.getAttributeValueAsReference(DW_AT_type, -1ULL), - CU2TypeDieDG.getOffset()); + EXPECT_EQ( + CU2Ref1DieDG.getAttributeValueAsReference(DW_AT_type).getValueOr(-1ULL), + CU2TypeDieDG.getOffset()); // Verify the sibling is our Ref2 DIE and that its DW_AT_type points to our // base type DIE in CU2. auto CU2Ref2DieDG = CU2Ref1DieDG.getSibling(); EXPECT_TRUE(CU2Ref2DieDG.isValid()); EXPECT_EQ(CU2Ref2DieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU2Ref2DieDG.getAttributeValueAsReference(DW_AT_type, -1ULL), - CU2TypeDieDG.getOffset()); + EXPECT_EQ( + CU2Ref2DieDG.getAttributeValueAsReference(DW_AT_type).getValueOr(-1ULL), + CU2TypeDieDG.getOffset()); // Verify the sibling is our Ref4 DIE and that its DW_AT_type points to our // base type DIE in CU2. auto CU2Ref4DieDG = CU2Ref2DieDG.getSibling(); EXPECT_TRUE(CU2Ref4DieDG.isValid()); EXPECT_EQ(CU2Ref4DieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU2Ref4DieDG.getAttributeValueAsReference(DW_AT_type, -1ULL), - CU2TypeDieDG.getOffset()); + EXPECT_EQ( + CU2Ref4DieDG.getAttributeValueAsReference(DW_AT_type).getValueOr(-1ULL), + CU2TypeDieDG.getOffset()); // Verify the sibling is our Ref8 DIE and that its DW_AT_type points to our // base type DIE in CU2. auto CU2Ref8DieDG = CU2Ref4DieDG.getSibling(); EXPECT_TRUE(CU2Ref8DieDG.isValid()); EXPECT_EQ(CU2Ref8DieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU2Ref8DieDG.getAttributeValueAsReference(DW_AT_type, -1ULL), - CU2TypeDieDG.getOffset()); + EXPECT_EQ( + CU2Ref8DieDG.getAttributeValueAsReference(DW_AT_type).getValueOr(-1ULL), + CU2TypeDieDG.getOffset()); // Verify the sibling is our RefAddr DIE and that its DW_AT_type points to our // base type DIE in CU2. 
auto CU2RefAddrDieDG = CU2Ref8DieDG.getSibling(); EXPECT_TRUE(CU2RefAddrDieDG.isValid()); EXPECT_EQ(CU2RefAddrDieDG.getTag(), DW_TAG_variable); - EXPECT_EQ( - CU2RefAddrDieDG.getAttributeValueAsReference(DW_AT_type, -1ULL), - CU2TypeDieDG.getOffset()); + EXPECT_EQ(CU2RefAddrDieDG.getAttributeValueAsReference(DW_AT_type) + .getValueOr(-1ULL), + CU2TypeDieDG.getOffset()); // Verify the sibling of the Ref4 DIE is our RefAddr DIE and that its // DW_AT_type points to our base type DIE. auto CU2ToCU1RefAddrDieDG = CU2RefAddrDieDG.getSibling(); EXPECT_TRUE(CU2ToCU1RefAddrDieDG.isValid()); EXPECT_EQ(CU2ToCU1RefAddrDieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU2ToCU1RefAddrDieDG.getAttributeValueAsReference(DW_AT_type, - -1ULL), + EXPECT_EQ(CU2ToCU1RefAddrDieDG.getAttributeValueAsReference(DW_AT_type) + .getValueOr(-1ULL), CU1TypeDieDG.getOffset()); } diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp index 6c49deb32d94..ae9c2684212b 100644 --- a/unittests/IR/DominatorTreeTest.cpp +++ b/unittests/IR/DominatorTreeTest.cpp @@ -203,6 +203,16 @@ namespace llvm { EXPECT_EQ(DT->getNode(BB4)->getDFSNumIn(), 5UL); EXPECT_EQ(DT->getNode(BB4)->getDFSNumOut(), 6UL); + // Change root node + DT->verifyDomTree(); + BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "new_entry", + &F, BB0); + BranchInst::Create(BB0, NewEntry); + EXPECT_EQ(F.begin()->getName(), NewEntry->getName()); + EXPECT_TRUE(&F.getEntryBlock() == NewEntry); + DT->setNewRoot(NewEntry); + DT->verifyDomTree(); + return false; } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp index 579384c5a5f4..1812cd39d135 100644 --- a/unittests/IR/IRBuilderTest.cpp +++ b/unittests/IR/IRBuilderTest.cpp @@ -435,4 +435,73 @@ TEST_F(IRBuilderTest, DIImportedEntity) { EXPECT_TRUE(verifyModule(*M)); EXPECT_TRUE(CU->getImportedEntities().size() == 2); } + +// 0: #define M0 V0 <-- command line definition +// 0: main.c <-- main file +// 3: #define M1 V1 <-- M1 definition in main.c +// 5: #include "file.h" <-- inclusion of file.h from main.c +// 1: #define M2 <-- M2 definition in file.h with no value +// 7: #undef M1 V1 <-- M1 un-definition in main.c +TEST_F(IRBuilderTest, DIBuilderMacro) { + IRBuilder<> Builder(BB); + DIBuilder DIB(*M); + auto File1 = DIB.createFile("main.c", "/"); + auto File2 = DIB.createFile("file.h", "/"); + auto CU = DIB.createCompileUnit( + dwarf::DW_LANG_C, DIB.createFile("main.c", "/"), "llvm-c", true, "", 0); + auto MDef0 = + DIB.createMacro(nullptr, 0, dwarf::DW_MACINFO_define, "M0", "V0"); + auto TMF1 = DIB.createTempMacroFile(nullptr, 0, File1); + auto MDef1 = DIB.createMacro(TMF1, 3, dwarf::DW_MACINFO_define, "M1", "V1"); + auto TMF2 = DIB.createTempMacroFile(TMF1, 5, File2); + auto MDef2 = DIB.createMacro(TMF2, 1, dwarf::DW_MACINFO_define, "M2"); + auto MUndef1 = DIB.createMacro(TMF1, 7, dwarf::DW_MACINFO_undef, "M1"); + + EXPECT_EQ(dwarf::DW_MACINFO_define, MDef1->getMacinfoType()); + EXPECT_EQ(3u, MDef1->getLine()); + EXPECT_EQ("M1", MDef1->getName()); + EXPECT_EQ("V1", MDef1->getValue()); + + EXPECT_EQ(dwarf::DW_MACINFO_undef, MUndef1->getMacinfoType()); + EXPECT_EQ(7u, MUndef1->getLine()); + EXPECT_EQ("M1", MUndef1->getName()); + EXPECT_EQ("", MUndef1->getValue()); + + EXPECT_EQ(dwarf::DW_MACINFO_start_file, TMF2->getMacinfoType()); + EXPECT_EQ(5u, TMF2->getLine()); + EXPECT_EQ(File2, TMF2->getFile()); + + DIB.finalize(); + + SmallVector<Metadata *, 4> Elements; + Elements.push_back(MDef2); + auto MF2 = 
DIMacroFile::get(Ctx, dwarf::DW_MACINFO_start_file, 5, File2, + DIB.getOrCreateMacroArray(Elements)); + + Elements.clear(); + Elements.push_back(MDef1); + Elements.push_back(MF2); + Elements.push_back(MUndef1); + auto MF1 = DIMacroFile::get(Ctx, dwarf::DW_MACINFO_start_file, 0, File1, + DIB.getOrCreateMacroArray(Elements)); + + Elements.clear(); + Elements.push_back(MDef0); + Elements.push_back(MF1); + auto MN0 = MDTuple::get(Ctx, Elements); + EXPECT_EQ(MN0, CU->getRawMacros()); + + Elements.clear(); + Elements.push_back(MDef1); + Elements.push_back(MF2); + Elements.push_back(MUndef1); + auto MN1 = MDTuple::get(Ctx, Elements); + EXPECT_EQ(MN1, MF1->getRawElements()); + + Elements.clear(); + Elements.push_back(MDef2); + auto MN2 = MDTuple::get(Ctx, Elements); + EXPECT_EQ(MN2, MF2->getRawElements()); + EXPECT_TRUE(verifyModule(*M)); +} } diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt index 2ffedab82acb..6068de5514c7 100644 --- a/unittests/Support/CMakeLists.txt +++ b/unittests/Support/CMakeLists.txt @@ -43,10 +43,11 @@ add_llvm_unittest(SupportTests SpecialCaseListTest.cpp StringPool.cpp SwapByteOrderTest.cpp + TarWriterTest.cpp TargetParserTest.cpp - Threading.cpp ThreadLocalTest.cpp ThreadPool.cpp + Threading.cpp TimerTest.cpp TypeNameTest.cpp TrailingObjectsTest.cpp diff --git a/unittests/Support/TarWriterTest.cpp b/unittests/Support/TarWriterTest.cpp new file mode 100644 index 000000000000..84005de56c12 --- /dev/null +++ b/unittests/Support/TarWriterTest.cpp @@ -0,0 +1,88 @@ +//===- llvm/unittest/Support/TarWriterTest.cpp ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/TarWriter.h" +#include "gtest/gtest.h" + +using namespace llvm; +namespace { + +struct UstarHeader { + char Name[100]; + char Mode[8]; + char Uid[8]; + char Gid[8]; + char Size[12]; + char Mtime[12]; + char Checksum[8]; + char TypeFlag; + char Linkname[100]; + char Magic[6]; + char Version[2]; + char Uname[32]; + char Gname[32]; + char DevMajor[8]; + char DevMinor[8]; + char Prefix[155]; + char Pad[12]; +}; + +class TarWriterTest : public ::testing::Test {}; + +static UstarHeader create(StringRef Base, StringRef Filename) { + // Create a temporary file. + SmallString<128> Path; + std::error_code EC = + sys::fs::createTemporaryFile("TarWriterTest", "tar", Path); + EXPECT_FALSE((bool)EC); + + // Create a tar file. + Expected<std::unique_ptr<TarWriter>> TarOrErr = TarWriter::create(Path, Base); + EXPECT_TRUE((bool)TarOrErr); + std::unique_ptr<TarWriter> Tar = std::move(*TarOrErr); + Tar->append(Filename, "contents"); + Tar.reset(); + + // Read the tar file. 
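+  // The ustar header occupies the first 512 bytes of the archive (the field
+  // widths in UstarHeader above sum to 512), which is what the cast below
+  // reinterprets.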
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr = MemoryBuffer::getFile(Path); + EXPECT_TRUE((bool)MBOrErr); + std::unique_ptr<MemoryBuffer> MB = std::move(*MBOrErr); + sys::fs::remove(Path); + return *reinterpret_cast<const UstarHeader *>(MB->getBufferStart()); +} + +TEST_F(TarWriterTest, Basics) { + UstarHeader Hdr = create("base", "file"); + EXPECT_EQ("ustar", StringRef(Hdr.Magic)); + EXPECT_EQ("00", StringRef(Hdr.Version, 2)); + EXPECT_EQ("base/file", StringRef(Hdr.Name)); + EXPECT_EQ("00000000010", StringRef(Hdr.Size)); +} + +TEST_F(TarWriterTest, LongFilename) { + UstarHeader Hdr1 = create( + "012345678", std::string(99, 'x') + "/" + std::string(44, 'x') + "/foo"); + EXPECT_EQ("foo", StringRef(Hdr1.Name)); + EXPECT_EQ("012345678/" + std::string(99, 'x') + "/" + std::string(44, 'x'), + StringRef(Hdr1.Prefix)); + + UstarHeader Hdr2 = create( + "012345678", std::string(99, 'x') + "/" + std::string(45, 'x') + "/foo"); + EXPECT_EQ("foo", StringRef(Hdr2.Name)); + EXPECT_EQ("012345678/" + std::string(99, 'x') + "/" + std::string(45, 'x'), + StringRef(Hdr2.Prefix)); + + UstarHeader Hdr3 = create( + "012345678", std::string(99, 'x') + "/" + std::string(46, 'x') + "/foo"); + EXPECT_EQ(std::string(46, 'x') + "/foo", StringRef(Hdr3.Name)); + EXPECT_EQ("012345678/" + std::string(99, 'x'), StringRef(Hdr3.Prefix)); +} +} diff --git a/unittests/Transforms/CMakeLists.txt b/unittests/Transforms/CMakeLists.txt index 5d3b29c94d72..e2570a3b6537 100644 --- a/unittests/Transforms/CMakeLists.txt +++ b/unittests/Transforms/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(IPO) +add_subdirectory(Scalar) add_subdirectory(Utils) diff --git a/unittests/Transforms/Scalar/CMakeLists.txt b/unittests/Transforms/Scalar/CMakeLists.txt new file mode 100644 index 000000000000..2762799d6124 --- /dev/null +++ b/unittests/Transforms/Scalar/CMakeLists.txt @@ -0,0 +1,12 @@ +set(LLVM_LINK_COMPONENTS + Analysis + AsmParser + Core + Support + ScalarOpts + TransformUtils + ) + +add_llvm_unittest(ScalarTests + LoopPassManagerTest.cpp + ) diff --git a/unittests/Transforms/Scalar/LoopPassManagerTest.cpp b/unittests/Transforms/Scalar/LoopPassManagerTest.cpp new file mode 100644 index 000000000000..a099e35c7f19 --- /dev/null +++ b/unittests/Transforms/Scalar/LoopPassManagerTest.cpp @@ -0,0 +1,1438 @@ +//===- llvm/unittest/Analysis/LoopPassManagerTest.cpp - LPM tests ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +using testing::DoDefault; +using testing::Return; +using testing::Expectation; +using testing::Invoke; +using testing::InvokeWithoutArgs; +using testing::_; + +template <typename DerivedT, typename IRUnitT, + typename AnalysisManagerT = AnalysisManager<IRUnitT>, + typename... 
ExtraArgTs> +class MockAnalysisHandleBase { +public: + class Analysis : public AnalysisInfoMixin<Analysis> { + friend AnalysisInfoMixin<Analysis>; + friend MockAnalysisHandleBase; + static AnalysisKey Key; + + DerivedT *Handle; + + Analysis(DerivedT &Handle) : Handle(&Handle) {} + + public: + class Result { + friend MockAnalysisHandleBase; + + DerivedT *Handle; + + Result(DerivedT &Handle) : Handle(&Handle) {} + + public: + // Forward invalidation events to the mock handle. + bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA, + typename AnalysisManagerT::Invalidator &Inv) { + return Handle->invalidate(IR, PA, Inv); + } + }; + + Result run(IRUnitT &IR, AnalysisManagerT &AM, ExtraArgTs... ExtraArgs) { + return Handle->run(IR, AM, ExtraArgs...); + } + }; + + Analysis getAnalysis() { return Analysis(static_cast<DerivedT &>(*this)); } + typename Analysis::Result getResult() { + return typename Analysis::Result(static_cast<DerivedT &>(*this)); + } + +protected: + // FIXME: MSVC seems unable to handle a lambda argument to Invoke from within + // the template, so we use a boring static function. + static bool invalidateCallback(IRUnitT &IR, const PreservedAnalyses &PA, + typename AnalysisManagerT::Invalidator &Inv) { + auto PAC = PA.template getChecker<Analysis>(); + return !PAC.preserved() && + !PAC.template preservedSet<AllAnalysesOn<IRUnitT>>(); + } + + /// Derived classes should call this in their constructor to set up default + /// mock actions. (We can't do this in our constructor because this has to + /// run after the DerivedT is constructed.) + void setDefaults() { + ON_CALL(static_cast<DerivedT &>(*this), + run(_, _, testing::Matcher<ExtraArgTs>(_)...)) + .WillByDefault(Return(this->getResult())); + ON_CALL(static_cast<DerivedT &>(*this), invalidate(_, _, _)) + .WillByDefault(Invoke(&invalidateCallback)); + } +}; + +template <typename DerivedT, typename IRUnitT, typename AnalysisManagerT, + typename... ExtraArgTs> +AnalysisKey MockAnalysisHandleBase<DerivedT, IRUnitT, AnalysisManagerT, + ExtraArgTs...>::Analysis::Key; + +/// Mock handle for loop analyses. +/// +/// This is provided as a template accepting an (optional) integer. Because +/// analyses are identified and queried by type, this allows constructing +/// multiple handles with distinctly typed nested 'Analysis' types that can be +/// registered and queried. If you want to register multiple loop analysis +/// passes, you'll need to instantiate this type with different values for I. 
+/// For example: +/// +/// MockLoopAnalysisHandleTemplate<0> h0; +/// MockLoopAnalysisHandleTemplate<1> h1; +/// typedef decltype(h0)::Analysis Analysis0; +/// typedef decltype(h1)::Analysis Analysis1; +template <size_t I = static_cast<size_t>(-1)> +struct MockLoopAnalysisHandleTemplate + : MockAnalysisHandleBase<MockLoopAnalysisHandleTemplate<I>, Loop, + LoopAnalysisManager, + LoopStandardAnalysisResults &> { + typedef typename MockLoopAnalysisHandleTemplate::Analysis Analysis; + + MOCK_METHOD3_T(run, typename Analysis::Result(Loop &, LoopAnalysisManager &, + LoopStandardAnalysisResults &)); + + MOCK_METHOD3_T(invalidate, bool(Loop &, const PreservedAnalyses &, + LoopAnalysisManager::Invalidator &)); + + MockLoopAnalysisHandleTemplate() { this->setDefaults(); } +}; + +typedef MockLoopAnalysisHandleTemplate<> MockLoopAnalysisHandle; + +struct MockFunctionAnalysisHandle + : MockAnalysisHandleBase<MockFunctionAnalysisHandle, Function> { + MOCK_METHOD2(run, Analysis::Result(Function &, FunctionAnalysisManager &)); + + MOCK_METHOD3(invalidate, bool(Function &, const PreservedAnalyses &, + FunctionAnalysisManager::Invalidator &)); + + MockFunctionAnalysisHandle() { setDefaults(); } +}; + +template <typename DerivedT, typename IRUnitT, + typename AnalysisManagerT = AnalysisManager<IRUnitT>, + typename... ExtraArgTs> +class MockPassHandleBase { +public: + class Pass : public PassInfoMixin<Pass> { + friend MockPassHandleBase; + + DerivedT *Handle; + + Pass(DerivedT &Handle) : Handle(&Handle) {} + + public: + PreservedAnalyses run(IRUnitT &IR, AnalysisManagerT &AM, + ExtraArgTs... ExtraArgs) { + return Handle->run(IR, AM, ExtraArgs...); + } + }; + + Pass getPass() { return Pass(static_cast<DerivedT &>(*this)); } + +protected: + /// Derived classes should call this in their constructor to set up default + /// mock actions. (We can't do this in our constructor because this has to + /// run after the DerivedT is constructed.) + void setDefaults() { + ON_CALL(static_cast<DerivedT &>(*this), + run(_, _, testing::Matcher<ExtraArgTs>(_)...)) + .WillByDefault(Return(PreservedAnalyses::all())); + } +}; + +struct MockLoopPassHandle + : MockPassHandleBase<MockLoopPassHandle, Loop, LoopAnalysisManager, + LoopStandardAnalysisResults &, LPMUpdater &> { + MOCK_METHOD4(run, + PreservedAnalyses(Loop &, LoopAnalysisManager &, + LoopStandardAnalysisResults &, LPMUpdater &)); + MockLoopPassHandle() { setDefaults(); } +}; + +struct MockFunctionPassHandle + : MockPassHandleBase<MockFunctionPassHandle, Function> { + MOCK_METHOD2(run, PreservedAnalyses(Function &, FunctionAnalysisManager &)); + + MockFunctionPassHandle() { setDefaults(); } +}; + +struct MockModulePassHandle : MockPassHandleBase<MockModulePassHandle, Module> { + MOCK_METHOD2(run, PreservedAnalyses(Module &, ModuleAnalysisManager &)); + + MockModulePassHandle() { setDefaults(); } +}; + +/// Define a custom matcher for objects which support a 'getName' method +/// returning a StringRef. +/// +/// LLVM often has IR objects or analysis objects which expose a StringRef name +/// and in tests it is convenient to match these by name for readability. This +/// matcher supports any type exposing a getName() method of this form. +/// +/// It should be used as: +/// +/// HasName("my_function") +/// +/// No namespace or other qualification is required. +MATCHER_P(HasName, Name, "") { + // The matcher's name and argument are printed in the case of failure, but we + // also want to print out the name of the argument. 
This uses an implicitly
+ // available std::ostream, so we have to construct a std::string.
+ *result_listener << "has name '" << arg.getName().str() << "'";
+ return Name == arg.getName();
+}
+
+std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
+ SMDiagnostic Err;
+ return parseAssemblyString(IR, Err, C);
+}
+
+class LoopPassManagerTest : public ::testing::Test {
+protected:
+ LLVMContext Context;
+ std::unique_ptr<Module> M;
+
+ LoopAnalysisManager LAM;
+ FunctionAnalysisManager FAM;
+ ModuleAnalysisManager MAM;
+
+ MockLoopAnalysisHandle MLAHandle;
+ MockLoopPassHandle MLPHandle;
+ MockFunctionPassHandle MFPHandle;
+ MockModulePassHandle MMPHandle;
+
+ static PreservedAnalyses
+ getLoopAnalysisResult(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &) {
+ (void)AM.getResult<MockLoopAnalysisHandle::Analysis>(L, AR);
+ return PreservedAnalyses::all();
+ };
+
+public:
+ LoopPassManagerTest()
+ : M(parseIR(Context, "define void @f() {\n"
+ "entry:\n"
+ " br label %loop.0\n"
+ "loop.0:\n"
+ " br i1 undef, label %loop.0.0, label %end\n"
+ "loop.0.0:\n"
+ " br i1 undef, label %loop.0.0, label %loop.0.1\n"
+ "loop.0.1:\n"
+ " br i1 undef, label %loop.0.1, label %loop.0\n"
+ "end:\n"
+ " ret void\n"
+ "}\n"
+ "\n"
+ "define void @g() {\n"
+ "entry:\n"
+ " br label %loop.g.0\n"
+ "loop.g.0:\n"
+ " br i1 undef, label %loop.g.0, label %end\n"
+ "end:\n"
+ " ret void\n"
+ "}\n")),
+ LAM(true), FAM(true), MAM(true) {
+ // Register our mock analysis.
+ LAM.registerPass([&] { return MLAHandle.getAnalysis(); });
+
+ // We need DominatorTreeAnalysis for LoopAnalysis.
+ FAM.registerPass([&] { return DominatorTreeAnalysis(); });
+ FAM.registerPass([&] { return LoopAnalysis(); });
+ // We also allow loop passes to assume a set of other analyses and so need
+ // those.
+ FAM.registerPass([&] { return AAManager(); });
+ FAM.registerPass([&] { return AssumptionAnalysis(); });
+ FAM.registerPass([&] { return ScalarEvolutionAnalysis(); });
+ FAM.registerPass([&] { return TargetLibraryAnalysis(); });
+ FAM.registerPass([&] { return TargetIRAnalysis(); });
+
+ // Cross-register proxies.
+ LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); });
+ FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); });
+ FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); });
+ MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); });
+ }
+};
+
+TEST_F(LoopPassManagerTest, Basic) {
+ ModulePassManager MPM(true);
+ ::testing::InSequence MakeExpectationsSequenced;
+
+ // First we just visit all the loops in all the functions and get their
+ // analysis results. This will run the analysis a total of four times,
+ // once for each loop.
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _))
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _));
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.1"), _, _, _))
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _));
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _))
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _));
+ EXPECT_CALL(MLPHandle, run(HasName("loop.g.0"), _, _, _))
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ EXPECT_CALL(MLAHandle, run(HasName("loop.g.0"), _, _));
+ // Wire the loop pass through pass managers into the module pipeline.
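+ // (The adaptor nesting mirrors the IR nesting: the LoopPassManager becomes
+ // a function pass via createFunctionToLoopPassAdaptor, and the function
+ // pipeline becomes a module pass via createModuleToFunctionPassAdaptor.)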
+ {
+ LoopPassManager LPM(true);
+ LPM.addPass(MLPHandle.getPass());
+ FunctionPassManager FPM(true);
+ FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ }
+
+ // Next we run two passes over the loops. The first one invalidates the
+ // analyses for one loop, the second one tries to get the analysis results.
+ // This should force only one analysis to re-run within the loop PM, but will
+ // also invalidate everything after the loop pass manager finishes.
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _))
+ .WillOnce(DoDefault())
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.1"), _, _, _))
+ .WillOnce(InvokeWithoutArgs([] { return PreservedAnalyses::none(); }))
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _));
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _))
+ .WillOnce(DoDefault())
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ EXPECT_CALL(MLPHandle, run(HasName("loop.g.0"), _, _, _))
+ .WillOnce(DoDefault())
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ // Wire two loop pass runs into the module pipeline.
+ {
+ LoopPassManager LPM(true);
+ LPM.addPass(MLPHandle.getPass());
+ LPM.addPass(MLPHandle.getPass());
+ FunctionPassManager FPM(true);
+ FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ }
+
+ // And now run the pipeline across the module.
+ MPM.run(*M, MAM);
+}
+
+TEST_F(LoopPassManagerTest, FunctionPassInvalidationOfLoopAnalyses) {
+ ModulePassManager MPM(true);
+ FunctionPassManager FPM(true);
+ // We process each function completely in sequence.
+ ::testing::Sequence FSequence, GSequence;
+
+ // First, force the analysis result to be computed for each loop.
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _))
+ .InSequence(FSequence)
+ .WillOnce(DoDefault());
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _))
+ .InSequence(FSequence)
+ .WillOnce(DoDefault());
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _))
+ .InSequence(FSequence)
+ .WillOnce(DoDefault());
+ EXPECT_CALL(MLAHandle, run(HasName("loop.g.0"), _, _))
+ .InSequence(GSequence)
+ .WillOnce(DoDefault());
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>()));
+
+ // No need to re-run if we require again from a fresh loop pass manager.
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>()));
+
+ // For 'f', preserve most things but not the specific loop analyses.
+ EXPECT_CALL(MFPHandle, run(HasName("f"), _))
+ .InSequence(FSequence)
+ .WillOnce(Return(getLoopPassPreservedAnalyses()));
+ EXPECT_CALL(MLAHandle, invalidate(HasName("loop.0.0"), _, _))
+ .InSequence(FSequence)
+ .WillOnce(DoDefault());
+ // On one loop, skip the invalidation (as though we did an internal update).
+ EXPECT_CALL(MLAHandle, invalidate(HasName("loop.0.1"), _, _))
+ .InSequence(FSequence)
+ .WillOnce(Return(false));
+ EXPECT_CALL(MLAHandle, invalidate(HasName("loop.0"), _, _))
+ .InSequence(FSequence)
+ .WillOnce(DoDefault());
+ // Now two loops still have to be recomputed.
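+ // (Only two: loop.0.1's invalidate handler returned false above, so its
+ // cached result is treated as still valid and is not recomputed.)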
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)) + .InSequence(FSequence) + .WillOnce(DoDefault()); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)) + .InSequence(FSequence) + .WillOnce(DoDefault()); + // Preserve things in the second function to ensure invalidation remains + // isolated to one function. + EXPECT_CALL(MFPHandle, run(HasName("g"), _)) + .InSequence(GSequence) + .WillOnce(DoDefault()); + FPM.addPass(MFPHandle.getPass()); + FPM.addPass(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>())); + + EXPECT_CALL(MFPHandle, run(HasName("f"), _)) + .InSequence(FSequence) + .WillOnce(DoDefault()); + // For 'g', fail to preserve anything, causing the loops themselves to be + // cleared. We don't get an invalidation event here as the loop is gone, but + // we should still have to recompute the analysis. + EXPECT_CALL(MFPHandle, run(HasName("g"), _)) + .InSequence(GSequence) + .WillOnce(Return(PreservedAnalyses::none())); + EXPECT_CALL(MLAHandle, run(HasName("loop.g.0"), _, _)) + .InSequence(GSequence) + .WillOnce(DoDefault()); + FPM.addPass(MFPHandle.getPass()); + FPM.addPass(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>())); + + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + + // Verify with a separate function pass run that we didn't mess up 'f's + // cache. No analysis runs should be necessary here. + MPM.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>()))); + + MPM.run(*M, MAM); +} + +TEST_F(LoopPassManagerTest, ModulePassInvalidationOfLoopAnalyses) { + ModulePassManager MPM(true); + ::testing::InSequence MakeExpectationsSequenced; + + // First, force the analysis result to be computed for each loop. + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.g.0"), _, _)); + MPM.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>()))); + + // Walking all the way out and all the way back in doesn't re-run the + // analysis. + MPM.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>()))); + + // But a module pass that doesn't preserve the actual mock loop analysis + // invalidates all the way down and forces recomputing. + EXPECT_CALL(MMPHandle, run(_, _)).WillOnce(InvokeWithoutArgs([] { + auto PA = getLoopPassPreservedAnalyses(); + PA.preserve<FunctionAnalysisManagerModuleProxy>(); + return PA; + })); + // All the loop analyses from both functions get invalidated before we + // recompute anything. + EXPECT_CALL(MLAHandle, invalidate(HasName("loop.0.0"), _, _)); + // On one loop, again skip the invalidation (as though we did an internal + // update). + EXPECT_CALL(MLAHandle, invalidate(HasName("loop.0.1"), _, _)) + .WillOnce(Return(false)); + EXPECT_CALL(MLAHandle, invalidate(HasName("loop.0"), _, _)); + EXPECT_CALL(MLAHandle, invalidate(HasName("loop.g.0"), _, _)); + // Now all but one of the loops gets re-analyzed. 
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.g.0"), _, _)); + MPM.addPass(MMPHandle.getPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>()))); + + // Verify that the cached values persist. + MPM.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>()))); + + // Now we fail to preserve the loop analysis and observe that the loop + // analyses are cleared (so no invalidation event) as the loops themselves + // are no longer valid. + EXPECT_CALL(MMPHandle, run(_, _)).WillOnce(InvokeWithoutArgs([] { + auto PA = PreservedAnalyses::none(); + PA.preserve<FunctionAnalysisManagerModuleProxy>(); + return PA; + })); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.g.0"), _, _)); + MPM.addPass(MMPHandle.getPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>()))); + + // Verify that the cached values persist. + MPM.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>()))); + + // Next, check that even if we preserve everything within the function itelf, + // if the function's module pass proxy isn't preserved and the potential set + // of functions changes, the clear reaches the loop analyses as well. This + // will again trigger re-runs but not invalidation events. + EXPECT_CALL(MMPHandle, run(_, _)).WillOnce(InvokeWithoutArgs([] { + auto PA = PreservedAnalyses::none(); + PA.preserveSet<AllAnalysesOn<Function>>(); + PA.preserveSet<AllAnalysesOn<Loop>>(); + return PA; + })); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.g.0"), _, _)); + MPM.addPass(MMPHandle.getPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>()))); + + MPM.run(*M, MAM); +} + +// Test that if any of the bundled analyses provided in the LPM's signature +// become invalid, the analysis proxy itself becomes invalid and we clear all +// loop analysis results. +TEST_F(LoopPassManagerTest, InvalidationOfBundledAnalyses) { + ModulePassManager MPM(true); + FunctionPassManager FPM(true); + ::testing::InSequence MakeExpectationsSequenced; + + // First, force the analysis result to be computed for each loop. + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + FPM.addPass(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>())); + + // No need to re-run if we require again from a fresh loop pass manager. + FPM.addPass(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>())); + + // Preserving everything but the loop analyses themselves results in + // invalidation and running. 
+ EXPECT_CALL(MFPHandle, run(HasName("f"), _)) + .WillOnce(Return(getLoopPassPreservedAnalyses())); + EXPECT_CALL(MLAHandle, invalidate(_, _, _)).Times(3); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + FPM.addPass(MFPHandle.getPass()); + FPM.addPass(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>())); + + // The rest don't invalidate analyses, they only trigger re-runs because we + // clear the cache completely. + EXPECT_CALL(MFPHandle, run(HasName("f"), _)).WillOnce(InvokeWithoutArgs([] { + auto PA = PreservedAnalyses::none(); + // Not preserving `AAManager`. + PA.preserve<AssumptionAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + PA.preserve<LoopAnalysisManagerFunctionProxy>(); + PA.preserve<ScalarEvolutionAnalysis>(); + return PA; + })); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + FPM.addPass(MFPHandle.getPass()); + FPM.addPass(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>())); + + EXPECT_CALL(MFPHandle, run(HasName("f"), _)).WillOnce(InvokeWithoutArgs([] { + auto PA = PreservedAnalyses::none(); + PA.preserve<AAManager>(); + // Not preserving `AssumptionAnalysis`. + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + PA.preserve<LoopAnalysisManagerFunctionProxy>(); + PA.preserve<ScalarEvolutionAnalysis>(); + return PA; + })); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + FPM.addPass(MFPHandle.getPass()); + FPM.addPass(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>())); + + EXPECT_CALL(MFPHandle, run(HasName("f"), _)).WillOnce(InvokeWithoutArgs([] { + auto PA = PreservedAnalyses::none(); + PA.preserve<AAManager>(); + PA.preserve<AssumptionAnalysis>(); + // Not preserving `DominatorTreeAnalysis`. + PA.preserve<LoopAnalysis>(); + PA.preserve<LoopAnalysisManagerFunctionProxy>(); + PA.preserve<ScalarEvolutionAnalysis>(); + return PA; + })); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + FPM.addPass(MFPHandle.getPass()); + FPM.addPass(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>())); + + EXPECT_CALL(MFPHandle, run(HasName("f"), _)).WillOnce(InvokeWithoutArgs([] { + auto PA = PreservedAnalyses::none(); + PA.preserve<AAManager>(); + PA.preserve<AssumptionAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + // Not preserving the `LoopAnalysis`. 
+ PA.preserve<LoopAnalysisManagerFunctionProxy>(); + PA.preserve<ScalarEvolutionAnalysis>(); + return PA; + })); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + FPM.addPass(MFPHandle.getPass()); + FPM.addPass(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>())); + + EXPECT_CALL(MFPHandle, run(HasName("f"), _)).WillOnce(InvokeWithoutArgs([] { + auto PA = PreservedAnalyses::none(); + PA.preserve<AAManager>(); + PA.preserve<AssumptionAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + // Not preserving the `LoopAnalysisManagerFunctionProxy`. + PA.preserve<ScalarEvolutionAnalysis>(); + return PA; + })); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + FPM.addPass(MFPHandle.getPass()); + FPM.addPass(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>())); + + EXPECT_CALL(MFPHandle, run(HasName("f"), _)).WillOnce(InvokeWithoutArgs([] { + auto PA = PreservedAnalyses::none(); + PA.preserve<AAManager>(); + PA.preserve<AssumptionAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + PA.preserve<LoopAnalysisManagerFunctionProxy>(); + // Not preserving `ScalarEvolutionAnalysis`. + return PA; + })); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + FPM.addPass(MFPHandle.getPass()); + FPM.addPass(createFunctionToLoopPassAdaptor( + RequireAnalysisLoopPass<MockLoopAnalysisHandle::Analysis>())); + + // After all the churn on 'f', we'll compute the loop analysis results for + // 'g' once with a requires pass and then run our mock pass over g a bunch + // but just get cached results each time. + EXPECT_CALL(MLAHandle, run(HasName("loop.g.0"), _, _)); + EXPECT_CALL(MFPHandle, run(HasName("g"), _)).Times(7); + + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + MPM.run(*M, MAM); +} + +TEST_F(LoopPassManagerTest, IndirectInvalidation) { + // We need two distinct analysis types and handles. + enum { A, B }; + MockLoopAnalysisHandleTemplate<A> MLAHandleA; + MockLoopAnalysisHandleTemplate<B> MLAHandleB; + LAM.registerPass([&] { return MLAHandleA.getAnalysis(); }); + LAM.registerPass([&] { return MLAHandleB.getAnalysis(); }); + typedef decltype(MLAHandleA)::Analysis AnalysisA; + typedef decltype(MLAHandleB)::Analysis AnalysisB; + + // Set up AnalysisA to depend on our AnalysisB. For testing purposes we just + // need to get the AnalysisB results in AnalysisA's run method and check if + // AnalysisB gets invalidated in AnalysisA's invalidate method. + ON_CALL(MLAHandleA, run(_, _, _)) + .WillByDefault(Invoke([&](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR) { + (void)AM.getResult<AnalysisB>(L, AR); + return MLAHandleA.getResult(); + })); + ON_CALL(MLAHandleA, invalidate(_, _, _)) + .WillByDefault(Invoke([](Loop &L, const PreservedAnalyses &PA, + LoopAnalysisManager::Invalidator &Inv) { + auto PAC = PA.getChecker<AnalysisA>(); + return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Loop>>()) || + Inv.invalidate<AnalysisB>(L, PA); + })); + + ::testing::InSequence MakeExpectationsSequenced; + + // Compute the analyses across all of 'f' first. 
+ EXPECT_CALL(MLAHandleA, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandleB, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandleA, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandleB, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandleA, run(HasName("loop.0"), _, _)); + EXPECT_CALL(MLAHandleB, run(HasName("loop.0"), _, _)); + + // Now we invalidate AnalysisB (but not AnalysisA) for one of the loops and + // preserve everything for the rest. This in turn triggers that one loop to + // recompute both AnalysisB *and* AnalysisA if indirect invalidation is + // working. + EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _)) + .WillOnce(InvokeWithoutArgs([] { + auto PA = getLoopPassPreservedAnalyses(); + // Specifically preserve AnalysisA so that it would survive if it + // didn't depend on AnalysisB. + PA.preserve<AnalysisA>(); + return PA; + })); + // It happens that AnalysisB is invalidated first. That shouldn't matter + // though, and we should still call AnalysisA's invalidation. + EXPECT_CALL(MLAHandleB, invalidate(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandleA, invalidate(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _)) + .WillOnce(Invoke([](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &) { + (void)AM.getResult<AnalysisA>(L, AR); + return PreservedAnalyses::all(); + })); + EXPECT_CALL(MLAHandleA, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandleB, run(HasName("loop.0.0"), _, _)); + // The rest of the loops should run and get cached results. + EXPECT_CALL(MLPHandle, run(HasName("loop.0.1"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke([](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &) { + (void)AM.getResult<AnalysisA>(L, AR); + return PreservedAnalyses::all(); + })); + EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke([](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &) { + (void)AM.getResult<AnalysisA>(L, AR); + return PreservedAnalyses::all(); + })); + + // The run over 'g' should be boring, with us just computing the analyses once + // up front and then running loop passes and getting cached results. + EXPECT_CALL(MLAHandleA, run(HasName("loop.g.0"), _, _)); + EXPECT_CALL(MLAHandleB, run(HasName("loop.g.0"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.g.0"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke([](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &) { + (void)AM.getResult<AnalysisA>(L, AR); + return PreservedAnalyses::all(); + })); + + // Build the pipeline and run it. + ModulePassManager MPM(true); + FunctionPassManager FPM(true); + FPM.addPass( + createFunctionToLoopPassAdaptor(RequireAnalysisLoopPass<AnalysisA>())); + LoopPassManager LPM(true); + LPM.addPass(MLPHandle.getPass()); + LPM.addPass(MLPHandle.getPass()); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM))); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + MPM.run(*M, MAM); +} + +TEST_F(LoopPassManagerTest, IndirectOuterPassInvalidation) { + typedef decltype(MLAHandle)::Analysis LoopAnalysis; + + MockFunctionAnalysisHandle MFAHandle; + FAM.registerPass([&] { return MFAHandle.getAnalysis(); }); + typedef decltype(MFAHandle)::Analysis FunctionAnalysis; + + // Set up the loop analysis to depend on both the function and module + // analysis. 
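+ // (Concretely: the default run action below registers, via the
+ // FunctionAnalysisManagerLoopProxy, an outer-analysis invalidation edge
+ // from FunctionAnalysis to LoopAnalysis, but only when the function
+ // analysis result is already in the cache.)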
+ ON_CALL(MLAHandle, run(_, _, _)) + .WillByDefault(Invoke([&](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR) { + auto &FAMP = AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR); + auto &FAM = FAMP.getManager(); + Function &F = *L.getHeader()->getParent(); + if (FAM.getCachedResult<FunctionAnalysis>(F)) + FAMP.registerOuterAnalysisInvalidation<FunctionAnalysis, + LoopAnalysis>(); + return MLAHandle.getResult(); + })); + + ::testing::InSequence MakeExpectationsSequenced; + + // Compute the analyses across all of 'f' first. + EXPECT_CALL(MFPHandle, run(HasName("f"), _)) + .WillOnce(Invoke([](Function &F, FunctionAnalysisManager &AM) { + // Force the computing of the function analysis so it is available in + // this function. + (void)AM.getResult<FunctionAnalysis>(F); + return PreservedAnalyses::all(); + })); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + + // Now invalidate the function analysis but preserve the loop analyses. + // This should trigger immediate invalidation of the loop analyses, despite + // the fact that they were preserved. + EXPECT_CALL(MFPHandle, run(HasName("f"), _)).WillOnce(InvokeWithoutArgs([] { + auto PA = getLoopPassPreservedAnalyses(); + PA.preserveSet<AllAnalysesOn<Loop>>(); + return PA; + })); + EXPECT_CALL(MLAHandle, invalidate(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, invalidate(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, invalidate(HasName("loop.0"), _, _)); + + // And re-running a requires pass recomputes them. + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + + // When we run over 'g' we don't populate the cache with the function + // analysis. + EXPECT_CALL(MFPHandle, run(HasName("g"), _)) + .WillOnce(Return(PreservedAnalyses::all())); + EXPECT_CALL(MLAHandle, run(HasName("loop.g.0"), _, _)); + + // Which means that no extra invalidation occurs and cached values are used. + EXPECT_CALL(MFPHandle, run(HasName("g"), _)).WillOnce(InvokeWithoutArgs([] { + auto PA = getLoopPassPreservedAnalyses(); + PA.preserveSet<AllAnalysesOn<Loop>>(); + return PA; + })); + + // Build the pipeline and run it. + ModulePassManager MPM(true); + FunctionPassManager FPM(true); + FPM.addPass(MFPHandle.getPass()); + FPM.addPass( + createFunctionToLoopPassAdaptor(RequireAnalysisLoopPass<LoopAnalysis>())); + FPM.addPass(MFPHandle.getPass()); + FPM.addPass( + createFunctionToLoopPassAdaptor(RequireAnalysisLoopPass<LoopAnalysis>())); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + MPM.run(*M, MAM); +} + +TEST_F(LoopPassManagerTest, LoopChildInsertion) { + // Super boring module with three loops in a single loop nest. + M = parseIR(Context, "define void @f() {\n" + "entry:\n" + " br label %loop.0\n" + "loop.0:\n" + " br i1 undef, label %loop.0.0, label %end\n" + "loop.0.0:\n" + " br i1 undef, label %loop.0.0, label %loop.0.1\n" + "loop.0.1:\n" + " br i1 undef, label %loop.0.1, label %loop.0.2\n" + "loop.0.2:\n" + " br i1 undef, label %loop.0.2, label %loop.0\n" + "end:\n" + " ret void\n" + "}\n"); + + // Build up variables referring into the IR so we can rewrite it below + // easily. 
+ Function &F = *M->begin(); + ASSERT_THAT(F, HasName("f")); + auto BBI = F.begin(); + BasicBlock &EntryBB = *BBI++; + ASSERT_THAT(EntryBB, HasName("entry")); + BasicBlock &Loop0BB = *BBI++; + ASSERT_THAT(Loop0BB, HasName("loop.0")); + BasicBlock &Loop00BB = *BBI++; + ASSERT_THAT(Loop00BB, HasName("loop.0.0")); + BasicBlock &Loop01BB = *BBI++; + ASSERT_THAT(Loop01BB, HasName("loop.0.1")); + BasicBlock &Loop02BB = *BBI++; + ASSERT_THAT(Loop02BB, HasName("loop.0.2")); + BasicBlock &EndBB = *BBI++; + ASSERT_THAT(EndBB, HasName("end")); + ASSERT_THAT(BBI, F.end()); + + // Build the pass managers and register our pipeline. We build a single loop + // pass pipeline consisting of three mock pass runs over each loop. After + // this we run both domtree and loop verification passes to make sure that + // the IR remained valid during our mutations. + ModulePassManager MPM(true); + FunctionPassManager FPM(true); + LoopPassManager LPM(true); + LPM.addPass(MLPHandle.getPass()); + LPM.addPass(MLPHandle.getPass()); + LPM.addPass(MLPHandle.getPass()); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM))); + FPM.addPass(DominatorTreeVerifierPass()); + FPM.addPass(LoopVerifierPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + + // All the visit orders are deterministic, so we use simple fully order + // expectations. + ::testing::InSequence MakeExpectationsSequenced; + + // We run loop passes three times over each of the loops. + EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + EXPECT_CALL(MLPHandle, run(HasName("loop.0.1"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + + // When running over the middle loop, the second run inserts two new child + // loops, inserting them and itself into the worklist. + BasicBlock *NewLoop010BB; + EXPECT_CALL(MLPHandle, run(HasName("loop.0.1"), _, _, _)) + .WillOnce(Invoke([&](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &Updater) { + auto *NewLoop = new Loop(); + L.addChildLoop(NewLoop); + NewLoop010BB = BasicBlock::Create(Context, "loop.0.1.0", &F, &Loop02BB); + BranchInst::Create(&Loop01BB, NewLoop010BB, + UndefValue::get(Type::getInt1Ty(Context)), + NewLoop010BB); + Loop01BB.getTerminator()->replaceUsesOfWith(&Loop01BB, NewLoop010BB); + AR.DT.addNewBlock(NewLoop010BB, &Loop01BB); + NewLoop->addBasicBlockToLoop(NewLoop010BB, AR.LI); + Updater.addChildLoops({NewLoop}); + return PreservedAnalyses::all(); + })); + + // We should immediately drop down to fully visit the new inner loop. + EXPECT_CALL(MLPHandle, run(HasName("loop.0.1.0"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1.0"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.1.0"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + // After visiting the inner loop, we should re-visit the second loop + // reflecting its new loop nest structure. + EXPECT_CALL(MLPHandle, run(HasName("loop.0.1"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + + // In the second run over the middle loop after we've visited the new child, + // we add another child to check that we can repeatedly add children, and add + // children to a loop that already has children. 
+ BasicBlock *NewLoop011BB; + EXPECT_CALL(MLPHandle, run(HasName("loop.0.1"), _, _, _)) + .WillOnce(Invoke([&](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &Updater) { + auto *NewLoop = new Loop(); + L.addChildLoop(NewLoop); + NewLoop011BB = BasicBlock::Create(Context, "loop.0.1.1", &F, &Loop02BB); + BranchInst::Create(&Loop01BB, NewLoop011BB, + UndefValue::get(Type::getInt1Ty(Context)), + NewLoop011BB); + NewLoop010BB->getTerminator()->replaceUsesOfWith(&Loop01BB, + NewLoop011BB); + AR.DT.addNewBlock(NewLoop011BB, NewLoop010BB); + NewLoop->addBasicBlockToLoop(NewLoop011BB, AR.LI); + Updater.addChildLoops({NewLoop}); + return PreservedAnalyses::all(); + })); + + // Again, we should immediately drop down to visit the new, unvisited child + // loop. We don't need to revisit the other child though. + EXPECT_CALL(MLPHandle, run(HasName("loop.0.1.1"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1.1"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.1.1"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + // And now we should pop back up to the second loop and do a full pipeline of + // three passes on its current form. + EXPECT_CALL(MLPHandle, run(HasName("loop.0.1"), _, _, _)) + .Times(3) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + EXPECT_CALL(MLPHandle, run(HasName("loop.0.2"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.2"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.2"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + // Now that all the expected actions are registered, run the pipeline over + // our module. All of our expectations are verified when the test finishes. + MPM.run(*M, MAM); +} + +TEST_F(LoopPassManagerTest, LoopPeerInsertion) { + // Super boring module with two loop nests and loop nest with two child + // loops. + M = parseIR(Context, "define void @f() {\n" + "entry:\n" + " br label %loop.0\n" + "loop.0:\n" + " br i1 undef, label %loop.0.0, label %loop.2\n" + "loop.0.0:\n" + " br i1 undef, label %loop.0.0, label %loop.0.2\n" + "loop.0.2:\n" + " br i1 undef, label %loop.0.2, label %loop.0\n" + "loop.2:\n" + " br i1 undef, label %loop.2, label %end\n" + "end:\n" + " ret void\n" + "}\n"); + + // Build up variables referring into the IR so we can rewrite it below + // easily. + Function &F = *M->begin(); + ASSERT_THAT(F, HasName("f")); + auto BBI = F.begin(); + BasicBlock &EntryBB = *BBI++; + ASSERT_THAT(EntryBB, HasName("entry")); + BasicBlock &Loop0BB = *BBI++; + ASSERT_THAT(Loop0BB, HasName("loop.0")); + BasicBlock &Loop00BB = *BBI++; + ASSERT_THAT(Loop00BB, HasName("loop.0.0")); + BasicBlock &Loop02BB = *BBI++; + ASSERT_THAT(Loop02BB, HasName("loop.0.2")); + BasicBlock &Loop2BB = *BBI++; + ASSERT_THAT(Loop2BB, HasName("loop.2")); + BasicBlock &EndBB = *BBI++; + ASSERT_THAT(EndBB, HasName("end")); + ASSERT_THAT(BBI, F.end()); + Constant *Undefi1 = UndefValue::get(Type::getInt1Ty(Context)); + + // Build the pass managers and register our pipeline. We build a single loop + // pass pipeline consisting of three mock pass runs over each loop. 
After
+ // this we run both domtree and loop verification passes to make sure that
+ // the IR remained valid during our mutations.
+ ModulePassManager MPM(true);
+ FunctionPassManager FPM(true);
+ LoopPassManager LPM(true);
+ LPM.addPass(MLPHandle.getPass());
+ LPM.addPass(MLPHandle.getPass());
+ LPM.addPass(MLPHandle.getPass());
+ FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
+ FPM.addPass(DominatorTreeVerifierPass());
+ FPM.addPass(LoopVerifierPass());
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+
+ // All the visit orders are deterministic, so we use simple, fully ordered
+ // expectations.
+ ::testing::InSequence MakeExpectationsSequenced;
+
+ // We run loop passes three times over each of the loops.
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _))
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _));
+
+ // On the second run, we insert a sibling loop.
+ BasicBlock *NewLoop01BB;
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _))
+ .WillOnce(Invoke([&](Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &Updater) {
+ auto *NewLoop = new Loop();
+ L.getParentLoop()->addChildLoop(NewLoop);
+ NewLoop01BB = BasicBlock::Create(Context, "loop.0.1", &F, &Loop02BB);
+ BranchInst::Create(&Loop02BB, NewLoop01BB, Undefi1, NewLoop01BB);
+ Loop00BB.getTerminator()->replaceUsesOfWith(&Loop02BB, NewLoop01BB);
+ auto *NewDTNode = AR.DT.addNewBlock(NewLoop01BB, &Loop00BB);
+ AR.DT.changeImmediateDominator(AR.DT[&Loop02BB], NewDTNode);
+ NewLoop->addBasicBlockToLoop(NewLoop01BB, AR.LI);
+ Updater.addSiblingLoops({NewLoop});
+ return PreservedAnalyses::all();
+ }));
+ // We finish processing this loop as sibling loops don't perturb the
+ // postorder walk.
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _))
+ .WillOnce(Invoke(getLoopAnalysisResult));
+
+ // We visit the inserted sibling next.
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.1"), _, _, _))
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _));
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.1"), _, _, _))
+ .Times(2)
+ .WillRepeatedly(Invoke(getLoopAnalysisResult));
+
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.2"), _, _, _))
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0.2"), _, _));
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.2"), _, _, _))
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ // Next, on the third pass run on the last inner loop we add more new
+ // siblings, more than one, and one with nested child loops. By doing this at
+ // the end we make sure that edge case works well.
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.2"), _, _, _)) + .WillOnce(Invoke([&](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &Updater) { + Loop *NewLoops[] = {new Loop(), new Loop(), new Loop()}; + L.getParentLoop()->addChildLoop(NewLoops[0]); + L.getParentLoop()->addChildLoop(NewLoops[1]); + NewLoops[1]->addChildLoop(NewLoops[2]); + auto *NewLoop03BB = + BasicBlock::Create(Context, "loop.0.3", &F, &Loop2BB); + auto *NewLoop04BB = + BasicBlock::Create(Context, "loop.0.4", &F, &Loop2BB); + auto *NewLoop040BB = + BasicBlock::Create(Context, "loop.0.4.0", &F, &Loop2BB); + Loop02BB.getTerminator()->replaceUsesOfWith(&Loop0BB, NewLoop03BB); + BranchInst::Create(NewLoop04BB, NewLoop03BB, Undefi1, NewLoop03BB); + BranchInst::Create(&Loop0BB, NewLoop040BB, Undefi1, NewLoop04BB); + BranchInst::Create(NewLoop04BB, NewLoop040BB, Undefi1, NewLoop040BB); + AR.DT.addNewBlock(NewLoop03BB, &Loop02BB); + AR.DT.addNewBlock(NewLoop04BB, NewLoop03BB); + AR.DT.addNewBlock(NewLoop040BB, NewLoop04BB); + NewLoops[0]->addBasicBlockToLoop(NewLoop03BB, AR.LI); + NewLoops[1]->addBasicBlockToLoop(NewLoop04BB, AR.LI); + NewLoops[2]->addBasicBlockToLoop(NewLoop040BB, AR.LI); + Updater.addSiblingLoops({NewLoops[0], NewLoops[1]}); + return PreservedAnalyses::all(); + })); + + EXPECT_CALL(MLPHandle, run(HasName("loop.0.3"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.3"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.3"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + // Note that we need to visit the inner loop of this added sibling before the + // sibling itself! + EXPECT_CALL(MLPHandle, run(HasName("loop.0.4.0"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.4.0"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.4.0"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + EXPECT_CALL(MLPHandle, run(HasName("loop.0.4"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.4"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.4"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + // And only now do we visit the outermost loop of the nest. + EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + // On the second pass, we add sibling loops which become new top-level loops. 
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _)) + .WillOnce(Invoke([&](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &Updater) { + auto *NewLoop = new Loop(); + AR.LI.addTopLevelLoop(NewLoop); + auto *NewLoop1BB = BasicBlock::Create(Context, "loop.1", &F, &Loop2BB); + BranchInst::Create(&Loop2BB, NewLoop1BB, Undefi1, NewLoop1BB); + Loop0BB.getTerminator()->replaceUsesOfWith(&Loop2BB, NewLoop1BB); + auto *NewDTNode = AR.DT.addNewBlock(NewLoop1BB, &Loop0BB); + AR.DT.changeImmediateDominator(AR.DT[&Loop2BB], NewDTNode); + NewLoop->addBasicBlockToLoop(NewLoop1BB, AR.LI); + Updater.addSiblingLoops({NewLoop}); + return PreservedAnalyses::all(); + })); + EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + + EXPECT_CALL(MLPHandle, run(HasName("loop.1"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.1"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.1"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + EXPECT_CALL(MLPHandle, run(HasName("loop.2"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.2"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.2"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + // Now that all the expected actions are registered, run the pipeline over + // our module. All of our expectations are verified when the test finishes. + MPM.run(*M, MAM); +} + +TEST_F(LoopPassManagerTest, LoopDeletion) { + // Build a module with a single loop nest that contains one outer loop with + // three subloops, and one of those with its own subloop. We will + // incrementally delete all of these to test different deletion scenarios. + M = parseIR(Context, "define void @f() {\n" + "entry:\n" + " br label %loop.0\n" + "loop.0:\n" + " br i1 undef, label %loop.0.0, label %end\n" + "loop.0.0:\n" + " br i1 undef, label %loop.0.0, label %loop.0.1\n" + "loop.0.1:\n" + " br i1 undef, label %loop.0.1, label %loop.0.2\n" + "loop.0.2:\n" + " br i1 undef, label %loop.0.2.0, label %loop.0\n" + "loop.0.2.0:\n" + " br i1 undef, label %loop.0.2.0, label %loop.0.2\n" + "end:\n" + " ret void\n" + "}\n"); + + // Build up variables referring into the IR so we can rewrite it below + // easily. + Function &F = *M->begin(); + ASSERT_THAT(F, HasName("f")); + auto BBI = F.begin(); + BasicBlock &EntryBB = *BBI++; + ASSERT_THAT(EntryBB, HasName("entry")); + BasicBlock &Loop0BB = *BBI++; + ASSERT_THAT(Loop0BB, HasName("loop.0")); + BasicBlock &Loop00BB = *BBI++; + ASSERT_THAT(Loop00BB, HasName("loop.0.0")); + BasicBlock &Loop01BB = *BBI++; + ASSERT_THAT(Loop01BB, HasName("loop.0.1")); + BasicBlock &Loop02BB = *BBI++; + ASSERT_THAT(Loop02BB, HasName("loop.0.2")); + BasicBlock &Loop020BB = *BBI++; + ASSERT_THAT(Loop020BB, HasName("loop.0.2.0")); + BasicBlock &EndBB = *BBI++; + ASSERT_THAT(EndBB, HasName("end")); + ASSERT_THAT(BBI, F.end()); + Constant *Undefi1 = UndefValue::get(Type::getInt1Ty(Context)); + + // Helper to do the actual deletion of a loop. We directly encode this here + // to isolate ourselves from the rest of LLVM and for simplicity. Here we can + // egregiously cheat based on knowledge of the test case. For example, we + // have no PHI nodes and there is always a single i-dom. 
+ auto DeleteLoopBlocks = [](Loop &L, BasicBlock &IDomBB, + LoopStandardAnalysisResults &AR, + LPMUpdater &Updater) { + for (BasicBlock *LoopBB : L.blocks()) { + SmallVector<DomTreeNode *, 4> ChildNodes(AR.DT[LoopBB]->begin(), + AR.DT[LoopBB]->end()); + for (DomTreeNode *ChildNode : ChildNodes) + AR.DT.changeImmediateDominator(ChildNode, AR.DT[&IDomBB]); + AR.DT.eraseNode(LoopBB); + LoopBB->dropAllReferences(); + } + SmallVector<BasicBlock *, 4> LoopBBs(L.block_begin(), L.block_end()); + Updater.markLoopAsDeleted(L); + AR.LI.markAsRemoved(&L); + for (BasicBlock *LoopBB : LoopBBs) + LoopBB->eraseFromParent(); + }; + + // Build up the pass managers. + ModulePassManager MPM(true); + FunctionPassManager FPM(true); + // We run several loop pass pipelines across the loop nest, but they all take + // the same form of three mock pass runs in a loop pipeline followed by + // domtree and loop verification. We use a lambda to stamp this out each + // time. + auto AddLoopPipelineAndVerificationPasses = [&] { + LoopPassManager LPM(true); + LPM.addPass(MLPHandle.getPass()); + LPM.addPass(MLPHandle.getPass()); + LPM.addPass(MLPHandle.getPass()); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM))); + FPM.addPass(DominatorTreeVerifierPass()); + FPM.addPass(LoopVerifierPass()); + }; + + // All the visit orders are deterministic so we use simple fully order + // expectations. + ::testing::InSequence MakeExpectationsSequenced; + + // We run the loop pipeline with three passes over each of the loops. When + // running over the middle loop, the second pass in the pipeline deletes it. + // This should prevent the third pass from visiting it but otherwise leave + // the process unimpacted. + AddLoopPipelineAndVerificationPasses(); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.0"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + EXPECT_CALL(MLPHandle, run(HasName("loop.0.1"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.1"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.1"), _, _, _)) + .WillOnce( + Invoke([&](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &Updater) { + AR.SE.forgetLoop(&L); + Loop00BB.getTerminator()->replaceUsesOfWith(&Loop01BB, &Loop02BB); + DeleteLoopBlocks(L, Loop00BB, AR, Updater); + return PreservedAnalyses::all(); + })); + + EXPECT_CALL(MLPHandle, run(HasName("loop.0.2.0"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.2.0"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.2.0"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + EXPECT_CALL(MLPHandle, run(HasName("loop.0.2"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0.2"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.2"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLAHandle, run(HasName("loop.0"), _, _)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _)) + .Times(2) + .WillRepeatedly(Invoke(getLoopAnalysisResult)); + + // Run the loop pipeline again. 
This time we delete the last loop, which
+ // contains a nested loop within it, and we reuse its inner loop object to
+ // insert a new loop into the nest. This makes sure that we don't reuse
+ // cached analysis results for loop objects when removed just because their
+ // pointers match, and that we can handle nested loop deletion.
+ AddLoopPipelineAndVerificationPasses();
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _))
+ .Times(3)
+ .WillRepeatedly(Invoke(getLoopAnalysisResult));
+
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.2.0"), _, _, _))
+ .Times(3)
+ .WillRepeatedly(Invoke(getLoopAnalysisResult));
+
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.2"), _, _, _))
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ BasicBlock *NewLoop03BB;
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.2"), _, _, _))
+ .WillOnce(
+ Invoke([&](Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &Updater) {
+ // Delete the inner loop first. We also do this manually because we
+ // want to preserve the loop object and reuse it.
+ AR.SE.forgetLoop(*L.begin());
+ Loop02BB.getTerminator()->replaceUsesOfWith(&Loop020BB, &Loop02BB);
+ assert(std::next((*L.begin())->block_begin()) ==
+ (*L.begin())->block_end() &&
+ "There should only be one block.");
+ assert(AR.DT[&Loop020BB]->getNumChildren() == 0 &&
+ "Cannot have children in the domtree!");
+ AR.DT.eraseNode(&Loop020BB);
+ Updater.markLoopAsDeleted(**L.begin());
+ AR.LI.removeBlock(&Loop020BB);
+ auto *OldL = L.removeChildLoop(L.begin());
+ Loop020BB.eraseFromParent();
+
+ auto *ParentL = L.getParentLoop();
+ AR.SE.forgetLoop(&L);
+ Loop00BB.getTerminator()->replaceUsesOfWith(&Loop02BB, &Loop0BB);
+ DeleteLoopBlocks(L, Loop00BB, AR, Updater);
+
+ // Now insert a new sibling loop, reusing a loop pointer.
+ ParentL->addChildLoop(OldL);
+ NewLoop03BB = BasicBlock::Create(Context, "loop.0.3", &F, &EndBB);
+ BranchInst::Create(&Loop0BB, NewLoop03BB, Undefi1, NewLoop03BB);
+ Loop00BB.getTerminator()->replaceUsesOfWith(&Loop0BB, NewLoop03BB);
+ AR.DT.addNewBlock(NewLoop03BB, &Loop00BB);
+ OldL->addBasicBlockToLoop(NewLoop03BB, AR.LI);
+ Updater.addSiblingLoops({OldL});
+ return PreservedAnalyses::all();
+ }));
+
+ // To respect our inner-to-outer traversal order, we must visit the
+ // newly-inserted sibling of the loop we just deleted before we visit the
+ // outer loop. When we do so, this must compute a fresh analysis result, even
+ // though our new loop has the same pointer value as the loop we deleted.
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.3"), _, _, _))
+ .WillOnce(Invoke(getLoopAnalysisResult));
+ EXPECT_CALL(MLAHandle, run(HasName("loop.0.3"), _, _));
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0.3"), _, _, _))
+ .Times(2)
+ .WillRepeatedly(Invoke(getLoopAnalysisResult));
+
+ EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _))
+ .Times(3)
+ .WillRepeatedly(Invoke(getLoopAnalysisResult));
+
+ // In the final loop pipeline run we delete every loop, including the last
+ // loop of the nest. We do this again in the second pass in the pipeline, and
+ // as a consequence we never make it to three runs on any loop. We also cover
+ // deleting multiple loops in a single pipeline, deleting the first loop and
+ // deleting the (last) top-level loop.
+ AddLoopPipelineAndVerificationPasses(); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.0"), _, _, _)) + .WillOnce( + Invoke([&](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &Updater) { + AR.SE.forgetLoop(&L); + Loop0BB.getTerminator()->replaceUsesOfWith(&Loop00BB, NewLoop03BB); + DeleteLoopBlocks(L, Loop0BB, AR, Updater); + return PreservedAnalyses::all(); + })); + + EXPECT_CALL(MLPHandle, run(HasName("loop.0.3"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0.3"), _, _, _)) + .WillOnce( + Invoke([&](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &Updater) { + AR.SE.forgetLoop(&L); + Loop0BB.getTerminator()->replaceUsesOfWith(NewLoop03BB, &Loop0BB); + DeleteLoopBlocks(L, Loop0BB, AR, Updater); + return PreservedAnalyses::all(); + })); + + EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _)) + .WillOnce(Invoke(getLoopAnalysisResult)); + EXPECT_CALL(MLPHandle, run(HasName("loop.0"), _, _, _)) + .WillOnce( + Invoke([&](Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &Updater) { + AR.SE.forgetLoop(&L); + EntryBB.getTerminator()->replaceUsesOfWith(&Loop0BB, &EndBB); + DeleteLoopBlocks(L, EntryBB, AR, Updater); + return PreservedAnalyses::all(); + })); + + // Add the function pass pipeline now that it is fully built up and run it + // over the module's one function. + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + MPM.run(*M, MAM); +} +} diff --git a/utils/release/build_llvm_package.bat b/utils/release/build_llvm_package.bat index aace43c592e1..e7895157cd07 100755 --- a/utils/release/build_llvm_package.bat +++ b/utils/release/build_llvm_package.bat @@ -11,6 +11,10 @@ REM REM Visual Studio 2015, CMake, Ninja, SVN, GNUWin32, SWIG, Python 3,
REM NSIS with the strlen_8192 patch,
REM Visual Studio 2015 SDK (for the clang-format plugin).
+REM
+REM
+REM For LLDB, SWIG version <= 3.0.8 needs to be used to work around
+REM https://github.com/swig/swig/issues/769
REM You need to modify the paths below:
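The hunks below wire googlemock into LLVM's bundled googletest library and switch the shared unit-test main over to gmock initialization. As a rough sketch (not part of this commit), a gmock-enabled test binary only needs the single InitGoogleMock call, since that call also initializes googletest:

#include "gmock/gmock.h"
#include "gtest/gtest.h"

int main(int argc, char **argv) {
  // InitGoogleMock consumes both gmock and gtest flags from argv and
  // initializes both libraries; no separate InitGoogleTest call is needed.
  testing::InitGoogleMock(&argc, argv);
  return RUN_ALL_TESTS();
}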
diff --git a/utils/unittest/CMakeLists.txt b/utils/unittest/CMakeLists.txt index 16a354512741..a50733af9aae 100644 --- a/utils/unittest/CMakeLists.txt +++ b/utils/unittest/CMakeLists.txt @@ -15,6 +15,8 @@ include_directories( googletest/include googletest + googlemock/include + googlemock ) if(WIN32) @@ -45,6 +47,7 @@ endif() add_llvm_library(gtest googletest/src/gtest-all.cc + googlemock/src/gmock-all.cc LINK_LIBS ${LIBS} diff --git a/utils/unittest/UnitTestMain/TestMain.cpp b/utils/unittest/UnitTestMain/TestMain.cpp index 36cec2d47424..5660b44f41a3 100644 --- a/utils/unittest/UnitTestMain/TestMain.cpp +++ b/utils/unittest/UnitTestMain/TestMain.cpp @@ -9,9 +9,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Signals.h" +#include "gmock/gmock.h" #include "gtest/gtest.h" - #if defined(_WIN32) # include <windows.h> # if defined(_MSC_VER) @@ -24,7 +24,10 @@ const char *TestMainArgv0; int main(int argc, char **argv) { llvm::sys::PrintStackTraceOnErrorSignal(argv[0], true /* Disable crash reporting */); - testing::InitGoogleTest(&argc, argv); + + // Initialize both gmock and gtest. + testing::InitGoogleMock(&argc, argv); + llvm::cl::ParseCommandLineOptions(argc, argv); // Make it easy for a test to re-execute itself by saving argv[0]. diff --git a/utils/unittest/googlemock/LICENSE.txt b/utils/unittest/googlemock/LICENSE.txt new file mode 100644 index 000000000000..1941a11f8ce9 --- /dev/null +++ b/utils/unittest/googlemock/LICENSE.txt @@ -0,0 +1,28 @@ +Copyright 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/utils/unittest/googlemock/README.LLVM b/utils/unittest/googlemock/README.LLVM new file mode 100644 index 000000000000..9badc7d4a707 --- /dev/null +++ b/utils/unittest/googlemock/README.LLVM @@ -0,0 +1,17 @@ +LLVM notes +---------- + +This directory contains the 'googlemock' component of Google Test 1.8.0, with +all elements removed except for the actual source code, to minimize the +addition to the LLVM distribution. 
+ +Cleaned up as follows: + +# Remove all the unnecessary files and directories +$ rm -f CMakeLists.txt configure* Makefile* CHANGES CONTRIBUTORS README README.md .gitignore +$ rm -rf build-aux make msvc scripts test docs +$ rm -f `find . -name \*\.pump` +$ rm -f src/gmock_main.cc + +# Put the license in the consistent place for LLVM. +$ mv LICENSE LICENSE.TXT diff --git a/utils/unittest/googlemock/include/gmock/gmock-actions.h b/utils/unittest/googlemock/include/gmock/gmock-actions.h new file mode 100644 index 000000000000..b3f654af348d --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/gmock-actions.h @@ -0,0 +1,1205 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file implements some commonly used actions. + +#ifndef GMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_ +#define GMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_ + +#ifndef _WIN32_WCE +# include <errno.h> +#endif + +#include <algorithm> +#include <string> + +#include "gmock/internal/gmock-internal-utils.h" +#include "gmock/internal/gmock-port.h" + +#if GTEST_HAS_STD_TYPE_TRAITS_ // Defined by gtest-port.h via gmock-port.h. +#include <type_traits> +#endif + +namespace testing { + +// To implement an action Foo, define: +// 1. a class FooAction that implements the ActionInterface interface, and +// 2. a factory function that creates an Action object from a +// const FooAction*. +// +// The two-level delegation design follows that of Matcher, providing +// consistency for extension developers. It also eases ownership +// management as Action objects can now be copied like plain values. + +namespace internal { + +template <typename F1, typename F2> +class ActionAdaptor; + +// BuiltInDefaultValueGetter<T, true>::Get() returns a +// default-constructed T value. BuiltInDefaultValueGetter<T, +// false>::Get() crashes with an error. +// +// This primary template is used when kDefaultConstructible is true. 
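For orientation, the getter defined next, together with its false specialization, behaves as in this minimal sketch (illustrative only; NoDefaultCtor is an invented name, not part of the header):

#include "gmock/gmock.h"

struct NoDefaultCtor { explicit NoDefaultCtor(int) {} };

// The 'true' case simply value-initializes T:
int zero = testing::internal::BuiltInDefaultValueGetter<int, true>::Get();

// The 'false' case compiles, but calling Get() aborts at run time with
// "Default action undefined for the function return type.":
// testing::internal::BuiltInDefaultValueGetter<NoDefaultCtor, false>::Get();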
+template <typename T, bool kDefaultConstructible> +struct BuiltInDefaultValueGetter { + static T Get() { return T(); } +}; +template <typename T> +struct BuiltInDefaultValueGetter<T, false> { + static T Get() { + Assert(false, __FILE__, __LINE__, + "Default action undefined for the function return type."); + return internal::Invalid<T>(); + // The above statement will never be reached, but is required in + // order for this function to compile. + } +}; + +// BuiltInDefaultValue<T>::Get() returns the "built-in" default value +// for type T, which is NULL when T is a raw pointer type, 0 when T is +// a numeric type, false when T is bool, or "" when T is string or +// std::string. In addition, in C++11 and above, it returns a +// default-constructed T value if T is default constructible. For any +// other type T, the built-in default T value is undefined, and the +// function will abort the process. +template <typename T> +class BuiltInDefaultValue { + public: +#if GTEST_HAS_STD_TYPE_TRAITS_ + // This function returns true iff type T has a built-in default value. + static bool Exists() { + return ::std::is_default_constructible<T>::value; + } + + static T Get() { + return BuiltInDefaultValueGetter< + T, ::std::is_default_constructible<T>::value>::Get(); + } + +#else // GTEST_HAS_STD_TYPE_TRAITS_ + // This function returns true iff type T has a built-in default value. + static bool Exists() { + return false; + } + + static T Get() { + return BuiltInDefaultValueGetter<T, false>::Get(); + } + +#endif // GTEST_HAS_STD_TYPE_TRAITS_ +}; + +// This partial specialization says that we use the same built-in +// default value for T and const T. +template <typename T> +class BuiltInDefaultValue<const T> { + public: + static bool Exists() { return BuiltInDefaultValue<T>::Exists(); } + static T Get() { return BuiltInDefaultValue<T>::Get(); } +}; + +// This partial specialization defines the default values for pointer +// types. +template <typename T> +class BuiltInDefaultValue<T*> { + public: + static bool Exists() { return true; } + static T* Get() { return NULL; } +}; + +// The following specializations define the default values for +// specific types we care about. +#define GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(type, value) \ + template <> \ + class BuiltInDefaultValue<type> { \ + public: \ + static bool Exists() { return true; } \ + static type Get() { return value; } \ + } + +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(void, ); // NOLINT +#if GTEST_HAS_GLOBAL_STRING +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(::string, ""); +#endif // GTEST_HAS_GLOBAL_STRING +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(::std::string, ""); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(bool, false); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned char, '\0'); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed char, '\0'); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(char, '\0'); + +// There's no need for a default action for signed wchar_t, as that +// type is the same as wchar_t for gcc, and invalid for MSVC. +// +// There's also no need for a default action for unsigned wchar_t, as +// that type is the same as unsigned int for gcc, and invalid for +// MSVC.
+#if GMOCK_WCHAR_T_IS_NATIVE_ +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(wchar_t, 0U); // NOLINT +#endif + +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned short, 0U); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed short, 0); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned int, 0U); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed int, 0); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long, 0UL); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long, 0L); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(UInt64, 0); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(Int64, 0); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(float, 0); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(double, 0); + +#undef GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_ + +} // namespace internal + +// When an unexpected function call is encountered, Google Mock will +// let it return a default value if the user has specified one for its +// return type, or if the return type has a built-in default value; +// otherwise Google Mock won't know what value to return and will have +// to abort the process. +// +// The DefaultValue<T> class allows a user to specify the +// default value for a type T that is both copyable and publicly +// destructible (i.e. anything that can be used as a function return +// type). The usage is: +// +// // Sets the default value for type T to be foo. +// DefaultValue<T>::Set(foo); +template <typename T> +class DefaultValue { + public: + // Sets the default value for type T; requires T to be + // copy-constructable and have a public destructor. + static void Set(T x) { + delete producer_; + producer_ = new FixedValueProducer(x); + } + + // Provides a factory function to be called to generate the default value. + // This method can be used even if T is only move-constructible, but it is not + // limited to that case. + typedef T (*FactoryFunction)(); + static void SetFactory(FactoryFunction factory) { + delete producer_; + producer_ = new FactoryValueProducer(factory); + } + + // Unsets the default value for type T. + static void Clear() { + delete producer_; + producer_ = NULL; + } + + // Returns true iff the user has set the default value for type T. + static bool IsSet() { return producer_ != NULL; } + + // Returns true if T has a default return value set by the user or there + // exists a built-in default value. + static bool Exists() { + return IsSet() || internal::BuiltInDefaultValue<T>::Exists(); + } + + // Returns the default value for type T if the user has set one; + // otherwise returns the built-in default value. Requires that Exists() + // is true, which ensures that the return value is well-defined. + static T Get() { + return producer_ == NULL ? 
+ internal::BuiltInDefaultValue<T>::Get() : producer_->Produce(); + } + + private: + class ValueProducer { + public: + virtual ~ValueProducer() {} + virtual T Produce() = 0; + }; + + class FixedValueProducer : public ValueProducer { + public: + explicit FixedValueProducer(T value) : value_(value) {} + virtual T Produce() { return value_; } + + private: + const T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(FixedValueProducer); + }; + + class FactoryValueProducer : public ValueProducer { + public: + explicit FactoryValueProducer(FactoryFunction factory) + : factory_(factory) {} + virtual T Produce() { return factory_(); } + + private: + const FactoryFunction factory_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(FactoryValueProducer); + }; + + static ValueProducer* producer_; +}; + +// This partial specialization allows a user to set default values for +// reference types. +template <typename T> +class DefaultValue<T&> { + public: + // Sets the default value for type T&. + static void Set(T& x) { // NOLINT + address_ = &x; + } + + // Unsets the default value for type T&. + static void Clear() { + address_ = NULL; + } + + // Returns true iff the user has set the default value for type T&. + static bool IsSet() { return address_ != NULL; } + + // Returns true if T has a default return value set by the user or there + // exists a built-in default value. + static bool Exists() { + return IsSet() || internal::BuiltInDefaultValue<T&>::Exists(); + } + + // Returns the default value for type T& if the user has set one; + // otherwise returns the built-in default value if there is one; + // otherwise aborts the process. + static T& Get() { + return address_ == NULL ? + internal::BuiltInDefaultValue<T&>::Get() : *address_; + } + + private: + static T* address_; +}; + +// This specialization allows DefaultValue<void>::Get() to +// compile. +template <> +class DefaultValue<void> { + public: + static bool Exists() { return true; } + static void Get() {} +}; + +// Points to the user-set default value for type T. +template <typename T> +typename DefaultValue<T>::ValueProducer* DefaultValue<T>::producer_ = NULL; + +// Points to the user-set default value for type T&. +template <typename T> +T* DefaultValue<T&>::address_ = NULL; + +// Implement this interface to define an action for function type F. +template <typename F> +class ActionInterface { + public: + typedef typename internal::Function<F>::Result Result; + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + ActionInterface() {} + virtual ~ActionInterface() {} + + // Performs the action. This method is not const, as in general an + // action can have side effects and be stateful. For example, a + // get-the-next-element-from-the-collection action will need to + // remember the current element. + virtual Result Perform(const ArgumentTuple& args) = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ActionInterface); +}; + +// An Action<F> is a copyable and IMMUTABLE (except by assignment) +// object that represents an action to be taken when a mock function +// of type F is called. The implementation of Action<T> is just a +// linked_ptr to const ActionInterface<T>, so copying is fairly cheap. +// Don't inherit from Action! +// +// You can view an object implementing ActionInterface<F> as a +// concrete action (including its current state), and an Action<F> +// object as a handle to it. 
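The Action<F> handle defined next pairs with ActionInterface<F> as in this sketch (illustrative; IncrementAction is an invented example class, not from the header):

#include "gmock/gmock.h"

class IncrementAction : public testing::ActionInterface<int(int)> {
 public:
  // Returns the mock call's single argument plus one.
  virtual int Perform(const ArgumentTuple& args) {
    return testing::get<0>(args) + 1;
  }
};

// The handle owns the implementation via linked_ptr, so copies are cheap.
testing::Action<int(int)> increment(new IncrementAction);
// increment.Perform(testing::make_tuple(2)) yields 3.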
+template <typename F> +class Action { + public: + typedef typename internal::Function<F>::Result Result; + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + // Constructs a null Action. Needed for storing Action objects in + // STL containers. + Action() : impl_(NULL) {} + + // Constructs an Action from its implementation. A NULL impl is + // used to represent the "do-default" action. + explicit Action(ActionInterface<F>* impl) : impl_(impl) {} + + // Copy constructor. + Action(const Action& action) : impl_(action.impl_) {} + + // This constructor allows us to turn an Action<Func> object into an + // Action<F>, as long as F's arguments can be implicitly converted + // to Func's and Func's return type can be implicitly converted to + // F's. + template <typename Func> + explicit Action(const Action<Func>& action); + + // Returns true iff this is the DoDefault() action. + bool IsDoDefault() const { return impl_.get() == NULL; } + + // Performs the action. Note that this method is const even though + // the corresponding method in ActionInterface is not. The reason + // is that a const Action<F> means that it cannot be re-bound to + // another concrete action, not that the concrete action it binds to + // cannot change state. (Think of the difference between a const + // pointer and a pointer to const.) + Result Perform(const ArgumentTuple& args) const { + internal::Assert( + !IsDoDefault(), __FILE__, __LINE__, + "You are using DoDefault() inside a composite action like " + "DoAll() or WithArgs(). This is not supported for technical " + "reasons. Please instead spell out the default action, or " + "assign the default action to an Action variable and use " + "the variable in various places."); + return impl_->Perform(args); + } + + private: + template <typename F1, typename F2> + friend class internal::ActionAdaptor; + + internal::linked_ptr<ActionInterface<F> > impl_; +}; + +// The PolymorphicAction class template makes it easy to implement a +// polymorphic action (i.e. an action that can be used in mock +// functions of more than one type, e.g. Return()). +// +// To define a polymorphic action, a user first provides a COPYABLE +// implementation class that has a Perform() method template: +// +// class FooAction { +// public: +// template <typename Result, typename ArgumentTuple> +// Result Perform(const ArgumentTuple& args) const { +// // Processes the arguments and returns a result, using +// // tr1::get<N>(args) to get the N-th (0-based) argument in the tuple. +// } +// ... +// }; +// +// Then the user creates the polymorphic action using +// MakePolymorphicAction(object) where object has type FooAction. See +// the definition of Return(void) and SetArgumentPointee<N>(value) for +// complete examples.
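For instance, fleshing out the FooAction outline above, a polymorphic action that reports how many arguments the mock call received could look like this sketch (ReturnArity is invented here, not part of the header):

#include "gmock/gmock.h"

struct ReturnArity {
  template <typename Result, typename ArgumentTuple>
  Result Perform(const ArgumentTuple&) const {
    // tuple_size is gtest's portable tuple trait.
    return testing::tuple_size<ArgumentTuple>::value;
  }
};

// Usable in any mock function whose return type can hold the count, e.g.
// EXPECT_CALL(mock, Foo(_, _)).WillOnce(MakePolymorphicAction(ReturnArity()));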
+template <typename Impl> +class PolymorphicAction { + public: + explicit PolymorphicAction(const Impl& impl) : impl_(impl) {} + + template <typename F> + operator Action<F>() const { + return Action<F>(new MonomorphicImpl<F>(impl_)); + } + + private: + template <typename F> + class MonomorphicImpl : public ActionInterface<F> { + public: + typedef typename internal::Function<F>::Result Result; + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {} + + virtual Result Perform(const ArgumentTuple& args) { + return impl_.template Perform<Result>(args); + } + + private: + Impl impl_; + + GTEST_DISALLOW_ASSIGN_(MonomorphicImpl); + }; + + Impl impl_; + + GTEST_DISALLOW_ASSIGN_(PolymorphicAction); +}; + +// Creates an Action from its implementation and returns it. The +// created Action object owns the implementation. +template <typename F> +Action<F> MakeAction(ActionInterface<F>* impl) { + return Action<F>(impl); +} + +// Creates a polymorphic action from its implementation. This is +// easier to use than the PolymorphicAction<Impl> constructor as it +// doesn't require you to explicitly write the template argument, e.g. +// +// MakePolymorphicAction(foo); +// vs +// PolymorphicAction<TypeOfFoo>(foo); +template <typename Impl> +inline PolymorphicAction<Impl> MakePolymorphicAction(const Impl& impl) { + return PolymorphicAction<Impl>(impl); +} + +namespace internal { + +// Allows an Action<F2> object to pose as an Action<F1>, as long as F2 +// and F1 are compatible. +template <typename F1, typename F2> +class ActionAdaptor : public ActionInterface<F1> { + public: + typedef typename internal::Function<F1>::Result Result; + typedef typename internal::Function<F1>::ArgumentTuple ArgumentTuple; + + explicit ActionAdaptor(const Action<F2>& from) : impl_(from.impl_) {} + + virtual Result Perform(const ArgumentTuple& args) { + return impl_->Perform(args); + } + + private: + const internal::linked_ptr<ActionInterface<F2> > impl_; + + GTEST_DISALLOW_ASSIGN_(ActionAdaptor); +}; + +// Helper struct to specialize ReturnAction to execute a move instead of a copy +// on return. Useful for move-only types, but could be used on any type. +template <typename T> +struct ByMoveWrapper { + explicit ByMoveWrapper(T value) : payload(internal::move(value)) {} + T payload; +}; + +// Implements the polymorphic Return(x) action, which can be used in +// any function that returns the type of x, regardless of the argument +// types. +// +// Note: The value passed into Return must be converted into +// Function<F>::Result when this action is cast to Action<F> rather than +// when that action is performed. This is important in scenarios like +// +// MOCK_METHOD1(Method, T(U)); +// ... +// { +// Foo foo; +// X x(&foo); +// EXPECT_CALL(mock, Method(_)).WillOnce(Return(x)); +// } +// +// In the example above the variable x holds reference to foo which leaves +// scope and gets destroyed. If copying X just copies a reference to foo, +// that copy will be left with a hanging reference. If conversion to T +// makes a copy of foo, the above code is safe. To support that scenario, we +// need to make sure that the type conversion happens inside the EXPECT_CALL +// statement, and conversion of the result of Return to Action<T(U)> is a +// good place for that. +// +template <typename R> +class ReturnAction { + public: + // Constructs a ReturnAction object from the value to be returned. 
+ // 'value' is passed by value instead of by const reference in order + // to allow Return("string literal") to compile. + explicit ReturnAction(R value) : value_(new R(internal::move(value))) {} + + // This template type conversion operator allows Return(x) to be + // used in ANY function that returns x's type. + template <typename F> + operator Action<F>() const { + // Assert statement belongs here because this is the best place to verify + // conditions on F. It produces the clearest error messages + // in most compilers. + // Impl really belongs in this scope as a local class but can't + // because MSVC produces duplicate symbols in different translation units + // in this case. Until MS fixes that bug we put Impl into the class scope + // and put the typedef both here (for use in assert statement) and + // in the Impl class. But both definitions must be the same. + typedef typename Function<F>::Result Result; + GTEST_COMPILE_ASSERT_( + !is_reference<Result>::value, + use_ReturnRef_instead_of_Return_to_return_a_reference); + return Action<F>(new Impl<R, F>(value_)); + } + + private: + // Implements the Return(x) action for a particular function type F. + template <typename R_, typename F> + class Impl : public ActionInterface<F> { + public: + typedef typename Function<F>::Result Result; + typedef typename Function<F>::ArgumentTuple ArgumentTuple; + + // The implicit cast is necessary when Result has more than one + // single-argument constructor (e.g. Result is std::vector<int>) and R + // has a type conversion operator template. In that case, value_(value) + // won't compile as the compiler doesn't know which constructor of + // Result to call. ImplicitCast_ forces the compiler to convert R to + // Result without considering explicit constructors, thus resolving the + // ambiguity. value_ is then initialized using its copy constructor. + explicit Impl(const linked_ptr<R>& value) + : value_before_cast_(*value), + value_(ImplicitCast_<Result>(value_before_cast_)) {} + + virtual Result Perform(const ArgumentTuple&) { return value_; } + + private: + GTEST_COMPILE_ASSERT_(!is_reference<Result>::value, + Result_cannot_be_a_reference_type); + // We save the value before casting just in case it is being cast to a + // wrapper type. + R value_before_cast_; + Result value_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Impl); + }; + + // Partially specialize for ByMoveWrapper. This version of ReturnAction will + // move its contents instead. + template <typename R_, typename F> + class Impl<ByMoveWrapper<R_>, F> : public ActionInterface<F> { + public: + typedef typename Function<F>::Result Result; + typedef typename Function<F>::ArgumentTuple ArgumentTuple; + + explicit Impl(const linked_ptr<R>& wrapper) + : performed_(false), wrapper_(wrapper) {} + + virtual Result Perform(const ArgumentTuple&) { + GTEST_CHECK_(!performed_) + << "A ByMove() action should only be performed once."; + performed_ = true; + return internal::move(wrapper_->payload); + } + + private: + bool performed_; + const linked_ptr<R> wrapper_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + const linked_ptr<R> value_; + + GTEST_DISALLOW_ASSIGN_(ReturnAction); +}; + +// Implements the ReturnNull() action. +class ReturnNullAction { + public: + // Allows ReturnNull() to be used in any pointer-returning function. In C++11 + // this is enforced by returning nullptr, and in non-C++11 by asserting a + // pointer type at compile time.
+ template <typename Result, typename ArgumentTuple> + static Result Perform(const ArgumentTuple&) { +#if GTEST_LANG_CXX11 + return nullptr; +#else + GTEST_COMPILE_ASSERT_(internal::is_pointer<Result>::value, + ReturnNull_can_be_used_to_return_a_pointer_only); + return NULL; +#endif // GTEST_LANG_CXX11 + } +}; + +// Implements the Return() action. +class ReturnVoidAction { + public: + // Allows Return() to be used in any void-returning function. + template <typename Result, typename ArgumentTuple> + static void Perform(const ArgumentTuple&) { + CompileAssertTypesEqual<void, Result>(); + } +}; + +// Implements the polymorphic ReturnRef(x) action, which can be used +// in any function that returns a reference to the type of x, +// regardless of the argument types. +template <typename T> +class ReturnRefAction { + public: + // Constructs a ReturnRefAction object from the reference to be returned. + explicit ReturnRefAction(T& ref) : ref_(ref) {} // NOLINT + + // This template type conversion operator allows ReturnRef(x) to be + // used in ANY function that returns a reference to x's type. + template <typename F> + operator Action<F>() const { + typedef typename Function<F>::Result Result; + // Asserts that the function return type is a reference. This + // catches the user error of using ReturnRef(x) when Return(x) + // should be used, and generates some helpful error message. + GTEST_COMPILE_ASSERT_(internal::is_reference<Result>::value, + use_Return_instead_of_ReturnRef_to_return_a_value); + return Action<F>(new Impl<F>(ref_)); + } + + private: + // Implements the ReturnRef(x) action for a particular function type F. + template <typename F> + class Impl : public ActionInterface<F> { + public: + typedef typename Function<F>::Result Result; + typedef typename Function<F>::ArgumentTuple ArgumentTuple; + + explicit Impl(T& ref) : ref_(ref) {} // NOLINT + + virtual Result Perform(const ArgumentTuple&) { + return ref_; + } + + private: + T& ref_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + T& ref_; + + GTEST_DISALLOW_ASSIGN_(ReturnRefAction); +}; + +// Implements the polymorphic ReturnRefOfCopy(x) action, which can be +// used in any function that returns a reference to the type of x, +// regardless of the argument types. +template <typename T> +class ReturnRefOfCopyAction { + public: + // Constructs a ReturnRefOfCopyAction object from the reference to + // be returned. + explicit ReturnRefOfCopyAction(const T& value) : value_(value) {} // NOLINT + + // This template type conversion operator allows ReturnRefOfCopy(x) to be + // used in ANY function that returns a reference to x's type. + template <typename F> + operator Action<F>() const { + typedef typename Function<F>::Result Result; + // Asserts that the function return type is a reference. This + // catches the user error of using ReturnRefOfCopy(x) when Return(x) + // should be used, and generates some helpful error message. + GTEST_COMPILE_ASSERT_( + internal::is_reference<Result>::value, + use_Return_instead_of_ReturnRefOfCopy_to_return_a_value); + return Action<F>(new Impl<F>(value_)); + } + + private: + // Implements the ReturnRefOfCopy(x) action for a particular function type F. 
+ template <typename F> + class Impl : public ActionInterface<F> { + public: + typedef typename Function<F>::Result Result; + typedef typename Function<F>::ArgumentTuple ArgumentTuple; + + explicit Impl(const T& value) : value_(value) {} // NOLINT + + virtual Result Perform(const ArgumentTuple&) { + return value_; + } + + private: + T value_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + const T value_; + + GTEST_DISALLOW_ASSIGN_(ReturnRefOfCopyAction); +}; + +// Implements the polymorphic DoDefault() action. +class DoDefaultAction { + public: + // This template type conversion operator allows DoDefault() to be + // used in any function. + template <typename F> + operator Action<F>() const { return Action<F>(NULL); } +}; + +// Implements the Assign action to set a given pointer referent to a +// particular value. +template <typename T1, typename T2> +class AssignAction { + public: + AssignAction(T1* ptr, T2 value) : ptr_(ptr), value_(value) {} + + template <typename Result, typename ArgumentTuple> + void Perform(const ArgumentTuple& /* args */) const { + *ptr_ = value_; + } + + private: + T1* const ptr_; + const T2 value_; + + GTEST_DISALLOW_ASSIGN_(AssignAction); +}; + +#if !GTEST_OS_WINDOWS_MOBILE + +// Implements the SetErrnoAndReturn action to simulate return from +// various system calls and libc functions. +template <typename T> +class SetErrnoAndReturnAction { + public: + SetErrnoAndReturnAction(int errno_value, T result) + : errno_(errno_value), + result_(result) {} + template <typename Result, typename ArgumentTuple> + Result Perform(const ArgumentTuple& /* args */) const { + errno = errno_; + return result_; + } + + private: + const int errno_; + const T result_; + + GTEST_DISALLOW_ASSIGN_(SetErrnoAndReturnAction); +}; + +#endif // !GTEST_OS_WINDOWS_MOBILE + +// Implements the SetArgumentPointee<N>(x) action for any function +// whose N-th argument (0-based) is a pointer to x's type. The +// template parameter kIsProto is true iff type A is ProtocolMessage, +// proto2::Message, or a sub-class of those. +template <size_t N, typename A, bool kIsProto> +class SetArgumentPointeeAction { + public: + // Constructs an action that sets the variable pointed to by the + // N-th function argument to 'value'. + explicit SetArgumentPointeeAction(const A& value) : value_(value) {} + + template <typename Result, typename ArgumentTuple> + void Perform(const ArgumentTuple& args) const { + CompileAssertTypesEqual<void, Result>(); + *::testing::get<N>(args) = value_; + } + + private: + const A value_; + + GTEST_DISALLOW_ASSIGN_(SetArgumentPointeeAction); +}; + +template <size_t N, typename Proto> +class SetArgumentPointeeAction<N, Proto, true> { + public: + // Constructs an action that sets the variable pointed to by the + // N-th function argument to 'proto'. Both ProtocolMessage and + // proto2::Message have the CopyFrom() method, so the same + // implementation works for both. + explicit SetArgumentPointeeAction(const Proto& proto) : proto_(new Proto) { + proto_->CopyFrom(proto); + } + + template <typename Result, typename ArgumentTuple> + void Perform(const ArgumentTuple& args) const { + CompileAssertTypesEqual<void, Result>(); + ::testing::get<N>(args)->CopyFrom(*proto_); + } + + private: + const internal::linked_ptr<Proto> proto_; + + GTEST_DISALLOW_ASSIGN_(SetArgumentPointeeAction); +}; + +// Implements the InvokeWithoutArgs(f) action. The template argument +// FunctionImpl is the implementation type of f, which can be either a +// function pointer or a functor. 
InvokeWithoutArgs(f) can be used as an +// Action<F> as long as f's type is compatible with F (i.e. f can be +// assigned to a tr1::function<F>). +template <typename FunctionImpl> +class InvokeWithoutArgsAction { + public: + // The c'tor makes a copy of function_impl (either a function + // pointer or a functor). + explicit InvokeWithoutArgsAction(FunctionImpl function_impl) + : function_impl_(function_impl) {} + + // Allows InvokeWithoutArgs(f) to be used as any action whose type is + // compatible with f. + template <typename Result, typename ArgumentTuple> + Result Perform(const ArgumentTuple&) { return function_impl_(); } + + private: + FunctionImpl function_impl_; + + GTEST_DISALLOW_ASSIGN_(InvokeWithoutArgsAction); +}; + +// Implements the InvokeWithoutArgs(object_ptr, &Class::Method) action. +template <class Class, typename MethodPtr> +class InvokeMethodWithoutArgsAction { + public: + InvokeMethodWithoutArgsAction(Class* obj_ptr, MethodPtr method_ptr) + : obj_ptr_(obj_ptr), method_ptr_(method_ptr) {} + + template <typename Result, typename ArgumentTuple> + Result Perform(const ArgumentTuple&) const { + return (obj_ptr_->*method_ptr_)(); + } + + private: + Class* const obj_ptr_; + const MethodPtr method_ptr_; + + GTEST_DISALLOW_ASSIGN_(InvokeMethodWithoutArgsAction); +}; + +// Implements the IgnoreResult(action) action. +template <typename A> +class IgnoreResultAction { + public: + explicit IgnoreResultAction(const A& action) : action_(action) {} + + template <typename F> + operator Action<F>() const { + // Assert statement belongs here because this is the best place to verify + // conditions on F. It produces the clearest error messages + // in most compilers. + // Impl really belongs in this scope as a local class but can't + // because MSVC produces duplicate symbols in different translation units + // in this case. Until MS fixes that bug we put Impl into the class scope + // and put the typedef both here (for use in assert statement) and + // in the Impl class. But both definitions must be the same. + typedef typename internal::Function<F>::Result Result; + + // Asserts at compile time that F returns void. + CompileAssertTypesEqual<void, Result>(); + + return Action<F>(new Impl<F>(action_)); + } + + private: + template <typename F> + class Impl : public ActionInterface<F> { + public: + typedef typename internal::Function<F>::Result Result; + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + explicit Impl(const A& action) : action_(action) {} + + virtual void Perform(const ArgumentTuple& args) { + // Performs the action and ignores its result. + action_.Perform(args); + } + + private: + // Type OriginalFunction is the same as F except that its return + // type is IgnoredValue. + typedef typename internal::Function<F>::MakeResultIgnoredValue + OriginalFunction; + + const Action<OriginalFunction> action_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + const A action_; + + GTEST_DISALLOW_ASSIGN_(IgnoreResultAction); +}; + +// A ReferenceWrapper<T> object represents a reference to type T, +// which can be either const or not. It can be explicitly converted +// from, and implicitly converted to, a T&. Unlike a reference, +// ReferenceWrapper<T> can be copied and can survive template type +// inference. This is used to support by-reference arguments in the +// InvokeArgument<N>(...) action. The idea was from "reference +// wrappers" in tr1, which we don't have in our source tree yet. 
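The wrapper defined next round-trips as in this sketch (illustrative only; ByRef itself is declared near the end of this header):

#include "gmock/gmock.h"

void ByRefDemo() {
  int x = 0;
  // ByRef(x) yields a copyable ReferenceWrapper<int>; converting it back
  // to int& recovers the original object rather than a copy.
  int& alias = testing::ByRef(x);
  alias = 42;  // x is now 42
}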
+template <typename T> +class ReferenceWrapper { + public: + // Constructs a ReferenceWrapper<T> object from a T&. + explicit ReferenceWrapper(T& l_value) : pointer_(&l_value) {} // NOLINT + + // Allows a ReferenceWrapper<T> object to be implicitly converted to + // a T&. + operator T&() const { return *pointer_; } + private: + T* pointer_; +}; + +// Allows the expression ByRef(x) to be printed as a reference to x. +template <typename T> +void PrintTo(const ReferenceWrapper<T>& ref, ::std::ostream* os) { + T& value = ref; + UniversalPrinter<T&>::Print(value, os); +} + +// Does two actions sequentially. Used for implementing the DoAll(a1, +// a2, ...) action. +template <typename Action1, typename Action2> +class DoBothAction { + public: + DoBothAction(Action1 action1, Action2 action2) + : action1_(action1), action2_(action2) {} + + // This template type conversion operator allows DoAll(a1, ..., a_n) + // to be used in ANY function of compatible type. + template <typename F> + operator Action<F>() const { + return Action<F>(new Impl<F>(action1_, action2_)); + } + + private: + // Implements the DoAll(...) action for a particular function type F. + template <typename F> + class Impl : public ActionInterface<F> { + public: + typedef typename Function<F>::Result Result; + typedef typename Function<F>::ArgumentTuple ArgumentTuple; + typedef typename Function<F>::MakeResultVoid VoidResult; + + Impl(const Action<VoidResult>& action1, const Action<F>& action2) + : action1_(action1), action2_(action2) {} + + virtual Result Perform(const ArgumentTuple& args) { + action1_.Perform(args); + return action2_.Perform(args); + } + + private: + const Action<VoidResult> action1_; + const Action<F> action2_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + Action1 action1_; + Action2 action2_; + + GTEST_DISALLOW_ASSIGN_(DoBothAction); +}; + +} // namespace internal + +// An Unused object can be implicitly constructed from ANY value. +// This is handy when defining actions that ignore some or all of the +// mock function arguments. For example, given +// +// MOCK_METHOD3(Foo, double(const string& label, double x, double y)); +// MOCK_METHOD3(Bar, double(int index, double x, double y)); +// +// instead of +// +// double DistanceToOriginWithLabel(const string& label, double x, double y) { +// return sqrt(x*x + y*y); +// } +// double DistanceToOriginWithIndex(int index, double x, double y) { +// return sqrt(x*x + y*y); +// } +// ... +// EXPECT_CALL(mock, Foo("abc", _, _)) +// .WillOnce(Invoke(DistanceToOriginWithLabel)); +// EXPECT_CALL(mock, Bar(5, _, _)) +// .WillOnce(Invoke(DistanceToOriginWithIndex)); +// +// you could write +// +// // We can declare any uninteresting argument as Unused. +// double DistanceToOrigin(Unused, double x, double y) { +// return sqrt(x*x + y*y); +// } +// ... +// EXPECT_CALL(mock, Foo("abc", _, _)).WillOnce(Invoke(DistanceToOrigin)); +// EXPECT_CALL(mock, Bar(5, _, _)).WillOnce(Invoke(DistanceToOrigin)); +typedef internal::IgnoredValue Unused; + +// This constructor allows us to turn an Action<From> object into an +// Action<To>, as long as To's arguments can be implicitly converted +// to From's and From's return type can be implicitly converted to +// To's. +template <typename To> +template <typename From> +Action<To>::Action(const Action<From>& from) + : impl_(new internal::ActionAdaptor<To, From>(from)) {} + +// Creates an action that returns 'value'.
'value' is passed by value +// instead of const reference - otherwise Return("string literal") +// will trigger a compiler error about using array as initializer. +template <typename R> +internal::ReturnAction<R> Return(R value) { + return internal::ReturnAction<R>(internal::move(value)); +} + +// Creates an action that returns NULL. +inline PolymorphicAction<internal::ReturnNullAction> ReturnNull() { + return MakePolymorphicAction(internal::ReturnNullAction()); +} + +// Creates an action that returns from a void function. +inline PolymorphicAction<internal::ReturnVoidAction> Return() { + return MakePolymorphicAction(internal::ReturnVoidAction()); +} + +// Creates an action that returns the reference to a variable. +template <typename R> +inline internal::ReturnRefAction<R> ReturnRef(R& x) { // NOLINT + return internal::ReturnRefAction<R>(x); +} + +// Creates an action that returns the reference to a copy of the +// argument. The copy is created when the action is constructed and +// lives as long as the action. +template <typename R> +inline internal::ReturnRefOfCopyAction<R> ReturnRefOfCopy(const R& x) { + return internal::ReturnRefOfCopyAction<R>(x); +} + +// Modifies the parent action (a Return() action) to perform a move of the +// argument instead of a copy. +// Return(ByMove()) actions can only be executed once and will assert this +// invariant. +template <typename R> +internal::ByMoveWrapper<R> ByMove(R x) { + return internal::ByMoveWrapper<R>(internal::move(x)); +} + +// Creates an action that does the default action for the given mock function. +inline internal::DoDefaultAction DoDefault() { + return internal::DoDefaultAction(); +} + +// Creates an action that sets the variable pointed to by the N-th +// (0-based) function argument to 'value'. +template <size_t N, typename T> +PolymorphicAction< + internal::SetArgumentPointeeAction< + N, T, internal::IsAProtocolMessage<T>::value> > +SetArgPointee(const T& x) { + return MakePolymorphicAction(internal::SetArgumentPointeeAction< + N, T, internal::IsAProtocolMessage<T>::value>(x)); +} + +#if !((GTEST_GCC_VER_ && GTEST_GCC_VER_ < 40000) || GTEST_OS_SYMBIAN) +// This overload allows SetArgPointee() to accept a string literal. +// GCC prior to version 4.0 and the Symbian C++ compiler cannot distinguish +// this overload from the templated version and emit a compile error. +template <size_t N> +PolymorphicAction< + internal::SetArgumentPointeeAction<N, const char*, false> > +SetArgPointee(const char* p) { + return MakePolymorphicAction(internal::SetArgumentPointeeAction< + N, const char*, false>(p)); +} + +template <size_t N> +PolymorphicAction< + internal::SetArgumentPointeeAction<N, const wchar_t*, false> > +SetArgPointee(const wchar_t* p) { + return MakePolymorphicAction(internal::SetArgumentPointeeAction< + N, const wchar_t*, false>(p)); +} +#endif + +// The following version is DEPRECATED. +template <size_t N, typename T> +PolymorphicAction< + internal::SetArgumentPointeeAction< + N, T, internal::IsAProtocolMessage<T>::value> > +SetArgumentPointee(const T& x) { + return MakePolymorphicAction(internal::SetArgumentPointeeAction< + N, T, internal::IsAProtocolMessage<T>::value>(x)); +} + +// Creates an action that sets a pointer referent to a given value.
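The Assign() factory declared next can be exercised as in this self-contained sketch (illustrative; Closer and MockCloser are invented for the example):

#include "gmock/gmock.h"
#include "gtest/gtest.h"

struct Closer {
  virtual ~Closer() {}
  virtual void Close() = 0;
};

struct MockCloser : Closer {
  MOCK_METHOD0(Close, void());
};

TEST(AssignExample, SetsFlagOnCall) {
  MockCloser mock;
  bool closed = false;
  // When Close() is called, write 'true' through the pointer.
  EXPECT_CALL(mock, Close()).WillOnce(testing::Assign(&closed, true));
  mock.Close();
  EXPECT_TRUE(closed);
}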
+template <typename T1, typename T2> +PolymorphicAction<internal::AssignAction<T1, T2> > Assign(T1* ptr, T2 val) { + return MakePolymorphicAction(internal::AssignAction<T1, T2>(ptr, val)); +} + +#if !GTEST_OS_WINDOWS_MOBILE + +// Creates an action that sets errno and returns the appropriate error. +template <typename T> +PolymorphicAction<internal::SetErrnoAndReturnAction<T> > +SetErrnoAndReturn(int errval, T result) { + return MakePolymorphicAction( + internal::SetErrnoAndReturnAction<T>(errval, result)); +} + +#endif // !GTEST_OS_WINDOWS_MOBILE + +// Various overloads for InvokeWithoutArgs(). + +// Creates an action that invokes 'function_impl' with no argument. +template <typename FunctionImpl> +PolymorphicAction<internal::InvokeWithoutArgsAction<FunctionImpl> > +InvokeWithoutArgs(FunctionImpl function_impl) { + return MakePolymorphicAction( + internal::InvokeWithoutArgsAction<FunctionImpl>(function_impl)); +} + +// Creates an action that invokes the given method on the given object +// with no argument. +template <class Class, typename MethodPtr> +PolymorphicAction<internal::InvokeMethodWithoutArgsAction<Class, MethodPtr> > +InvokeWithoutArgs(Class* obj_ptr, MethodPtr method_ptr) { + return MakePolymorphicAction( + internal::InvokeMethodWithoutArgsAction<Class, MethodPtr>( + obj_ptr, method_ptr)); +} + +// Creates an action that performs an_action and throws away its +// result. In other words, it changes the return type of an_action to +// void. an_action MUST NOT return void, or the code won't compile. +template <typename A> +inline internal::IgnoreResultAction<A> IgnoreResult(const A& an_action) { + return internal::IgnoreResultAction<A>(an_action); +} + +// Creates a reference wrapper for the given L-value. If necessary, +// you can explicitly specify the type of the reference. For example, +// suppose 'derived' is an object of type Derived, ByRef(derived) +// would wrap a Derived&. If you want to wrap a const Base& instead, +// where Base is a base class of Derived, just write: +// +// ByRef<const Base>(derived) +template <typename T> +inline internal::ReferenceWrapper<T> ByRef(T& l_value) { // NOLINT + return internal::ReferenceWrapper<T>(l_value); +} + +} // namespace testing + +#endif // GMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_ diff --git a/utils/unittest/googlemock/include/gmock/gmock-cardinalities.h b/utils/unittest/googlemock/include/gmock/gmock-cardinalities.h new file mode 100644 index 000000000000..fc315f92ab5b --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/gmock-cardinalities.h @@ -0,0 +1,147 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file implements some commonly used cardinalities. More +// cardinalities can be defined by the user implementing the +// CardinalityInterface interface if necessary. + +#ifndef GMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_ +#define GMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_ + +#include <limits.h> +#include <ostream> // NOLINT +#include "gmock/internal/gmock-port.h" +#include "gtest/gtest.h" + +namespace testing { + +// To implement a cardinality Foo, define: +// 1. a class FooCardinality that implements the +// CardinalityInterface interface, and +// 2. a factory function that creates a Cardinality object from a +// const FooCardinality*. +// +// The two-level delegation design follows that of Matcher, providing +// consistency for extension developers. It also eases ownership +// management as Cardinality objects can now be copied like plain values. + +// The implementation of a cardinality. +class CardinalityInterface { + public: + virtual ~CardinalityInterface() {} + + // Conservative estimate on the lower/upper bound of the number of + // calls allowed. + virtual int ConservativeLowerBound() const { return 0; } + virtual int ConservativeUpperBound() const { return INT_MAX; } + + // Returns true iff call_count calls will satisfy this cardinality. + virtual bool IsSatisfiedByCallCount(int call_count) const = 0; + + // Returns true iff call_count calls will saturate this cardinality. + virtual bool IsSaturatedByCallCount(int call_count) const = 0; + + // Describes self to an ostream. + virtual void DescribeTo(::std::ostream* os) const = 0; +}; + +// A Cardinality is a copyable and IMMUTABLE (except by assignment) +// object that specifies how many times a mock function is expected to +// be called. The implementation of Cardinality is just a linked_ptr +// to const CardinalityInterface, so copying is fairly cheap. +// Don't inherit from Cardinality! +class GTEST_API_ Cardinality { + public: + // Constructs a null cardinality. Needed for storing Cardinality + // objects in STL containers. + Cardinality() {} + + // Constructs a Cardinality from its implementation. + explicit Cardinality(const CardinalityInterface* impl) : impl_(impl) {} + + // Conservative estimate on the lower/upper bound of the number of + // calls allowed. + int ConservativeLowerBound() const { return impl_->ConservativeLowerBound(); } + int ConservativeUpperBound() const { return impl_->ConservativeUpperBound(); } + + // Returns true iff call_count calls will satisfy this cardinality. 
+ bool IsSatisfiedByCallCount(int call_count) const { + return impl_->IsSatisfiedByCallCount(call_count); + } + + // Returns true iff call_count calls will saturate this cardinality. + bool IsSaturatedByCallCount(int call_count) const { + return impl_->IsSaturatedByCallCount(call_count); + } + + // Returns true iff call_count calls will over-saturate this + // cardinality, i.e. exceed the maximum number of allowed calls. + bool IsOverSaturatedByCallCount(int call_count) const { + return impl_->IsSaturatedByCallCount(call_count) && + !impl_->IsSatisfiedByCallCount(call_count); + } + + // Describes self to an ostream + void DescribeTo(::std::ostream* os) const { impl_->DescribeTo(os); } + + // Describes the given actual call count to an ostream. + static void DescribeActualCallCountTo(int actual_call_count, + ::std::ostream* os); + + private: + internal::linked_ptr<const CardinalityInterface> impl_; +}; + +// Creates a cardinality that allows at least n calls. +GTEST_API_ Cardinality AtLeast(int n); + +// Creates a cardinality that allows at most n calls. +GTEST_API_ Cardinality AtMost(int n); + +// Creates a cardinality that allows any number of calls. +GTEST_API_ Cardinality AnyNumber(); + +// Creates a cardinality that allows between min and max calls. +GTEST_API_ Cardinality Between(int min, int max); + +// Creates a cardinality that allows exactly n calls. +GTEST_API_ Cardinality Exactly(int n); + +// Creates a cardinality from its implementation. +inline Cardinality MakeCardinality(const CardinalityInterface* c) { + return Cardinality(c); +} + +} // namespace testing + +#endif // GMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_ diff --git a/utils/unittest/googlemock/include/gmock/gmock-generated-actions.h b/utils/unittest/googlemock/include/gmock/gmock-generated-actions.h new file mode 100644 index 000000000000..b5a889c0c3a4 --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/gmock-generated-actions.h @@ -0,0 +1,2377 @@ +// This file was GENERATED by a script. DO NOT EDIT BY HAND!!! + +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file implements some commonly used variadic actions. + +#ifndef GMOCK_INCLUDE_GMOCK_GMOCK_GENERATED_ACTIONS_H_ +#define GMOCK_INCLUDE_GMOCK_GMOCK_GENERATED_ACTIONS_H_ + +#include "gmock/gmock-actions.h" +#include "gmock/internal/gmock-port.h" + +namespace testing { +namespace internal { + +// InvokeHelper<F> knows how to unpack an N-tuple and invoke an N-ary +// function or method with the unpacked values, where F is a function +// type that takes N arguments. +template <typename Result, typename ArgumentTuple> +class InvokeHelper; + +template <typename R> +class InvokeHelper<R, ::testing::tuple<> > { + public: + template <typename Function> + static R Invoke(Function function, const ::testing::tuple<>&) { + return function(); + } + + template <class Class, typename MethodPtr> + static R InvokeMethod(Class* obj_ptr, + MethodPtr method_ptr, + const ::testing::tuple<>&) { + return (obj_ptr->*method_ptr)(); + } +}; + +template <typename R, typename A1> +class InvokeHelper<R, ::testing::tuple<A1> > { + public: + template <typename Function> + static R Invoke(Function function, const ::testing::tuple<A1>& args) { + return function(get<0>(args)); + } + + template <class Class, typename MethodPtr> + static R InvokeMethod(Class* obj_ptr, + MethodPtr method_ptr, + const ::testing::tuple<A1>& args) { + return (obj_ptr->*method_ptr)(get<0>(args)); + } +}; + +template <typename R, typename A1, typename A2> +class InvokeHelper<R, ::testing::tuple<A1, A2> > { + public: + template <typename Function> + static R Invoke(Function function, const ::testing::tuple<A1, A2>& args) { + return function(get<0>(args), get<1>(args)); + } + + template <class Class, typename MethodPtr> + static R InvokeMethod(Class* obj_ptr, + MethodPtr method_ptr, + const ::testing::tuple<A1, A2>& args) { + return (obj_ptr->*method_ptr)(get<0>(args), get<1>(args)); + } +}; + +template <typename R, typename A1, typename A2, typename A3> +class InvokeHelper<R, ::testing::tuple<A1, A2, A3> > { + public: + template <typename Function> + static R Invoke(Function function, const ::testing::tuple<A1, A2, A3>& args) { + return function(get<0>(args), get<1>(args), get<2>(args)); + } + + template <class Class, typename MethodPtr> + static R InvokeMethod(Class* obj_ptr, + MethodPtr method_ptr, + const ::testing::tuple<A1, A2, A3>& args) { + return (obj_ptr->*method_ptr)(get<0>(args), get<1>(args), + get<2>(args)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4> +class InvokeHelper<R, ::testing::tuple<A1, A2, A3, A4> > { + public: + template <typename Function> + static R Invoke(Function function, const ::testing::tuple<A1, A2, A3, + A4>& args) { + return function(get<0>(args), get<1>(args), get<2>(args), + get<3>(args)); + } + + template <class Class, typename MethodPtr> + static R InvokeMethod(Class* obj_ptr, + MethodPtr method_ptr, + const 
::testing::tuple<A1, A2, A3, A4>& args) { + return (obj_ptr->*method_ptr)(get<0>(args), get<1>(args), + get<2>(args), get<3>(args)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5> +class InvokeHelper<R, ::testing::tuple<A1, A2, A3, A4, A5> > { + public: + template <typename Function> + static R Invoke(Function function, const ::testing::tuple<A1, A2, A3, A4, + A5>& args) { + return function(get<0>(args), get<1>(args), get<2>(args), + get<3>(args), get<4>(args)); + } + + template <class Class, typename MethodPtr> + static R InvokeMethod(Class* obj_ptr, + MethodPtr method_ptr, + const ::testing::tuple<A1, A2, A3, A4, A5>& args) { + return (obj_ptr->*method_ptr)(get<0>(args), get<1>(args), + get<2>(args), get<3>(args), get<4>(args)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6> +class InvokeHelper<R, ::testing::tuple<A1, A2, A3, A4, A5, A6> > { + public: + template <typename Function> + static R Invoke(Function function, const ::testing::tuple<A1, A2, A3, A4, A5, + A6>& args) { + return function(get<0>(args), get<1>(args), get<2>(args), + get<3>(args), get<4>(args), get<5>(args)); + } + + template <class Class, typename MethodPtr> + static R InvokeMethod(Class* obj_ptr, + MethodPtr method_ptr, + const ::testing::tuple<A1, A2, A3, A4, A5, A6>& args) { + return (obj_ptr->*method_ptr)(get<0>(args), get<1>(args), + get<2>(args), get<3>(args), get<4>(args), get<5>(args)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7> +class InvokeHelper<R, ::testing::tuple<A1, A2, A3, A4, A5, A6, A7> > { + public: + template <typename Function> + static R Invoke(Function function, const ::testing::tuple<A1, A2, A3, A4, A5, + A6, A7>& args) { + return function(get<0>(args), get<1>(args), get<2>(args), + get<3>(args), get<4>(args), get<5>(args), get<6>(args)); + } + + template <class Class, typename MethodPtr> + static R InvokeMethod(Class* obj_ptr, + MethodPtr method_ptr, + const ::testing::tuple<A1, A2, A3, A4, A5, A6, + A7>& args) { + return (obj_ptr->*method_ptr)(get<0>(args), get<1>(args), + get<2>(args), get<3>(args), get<4>(args), get<5>(args), + get<6>(args)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7, typename A8> +class InvokeHelper<R, ::testing::tuple<A1, A2, A3, A4, A5, A6, A7, A8> > { + public: + template <typename Function> + static R Invoke(Function function, const ::testing::tuple<A1, A2, A3, A4, A5, + A6, A7, A8>& args) { + return function(get<0>(args), get<1>(args), get<2>(args), + get<3>(args), get<4>(args), get<5>(args), get<6>(args), + get<7>(args)); + } + + template <class Class, typename MethodPtr> + static R InvokeMethod(Class* obj_ptr, + MethodPtr method_ptr, + const ::testing::tuple<A1, A2, A3, A4, A5, A6, A7, + A8>& args) { + return (obj_ptr->*method_ptr)(get<0>(args), get<1>(args), + get<2>(args), get<3>(args), get<4>(args), get<5>(args), + get<6>(args), get<7>(args)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7, typename A8, typename A9> +class InvokeHelper<R, ::testing::tuple<A1, A2, A3, A4, A5, A6, A7, A8, A9> > { + public: + template <typename Function> + static R Invoke(Function function, const ::testing::tuple<A1, A2, A3, A4, A5, + A6, A7, A8, A9>& args) { + return function(get<0>(args), get<1>(args), get<2>(args), + 
get<3>(args), get<4>(args), get<5>(args), get<6>(args), + get<7>(args), get<8>(args)); + } + + template <class Class, typename MethodPtr> + static R InvokeMethod(Class* obj_ptr, + MethodPtr method_ptr, + const ::testing::tuple<A1, A2, A3, A4, A5, A6, A7, A8, + A9>& args) { + return (obj_ptr->*method_ptr)(get<0>(args), get<1>(args), + get<2>(args), get<3>(args), get<4>(args), get<5>(args), + get<6>(args), get<7>(args), get<8>(args)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7, typename A8, typename A9, + typename A10> +class InvokeHelper<R, ::testing::tuple<A1, A2, A3, A4, A5, A6, A7, A8, A9, + A10> > { + public: + template <typename Function> + static R Invoke(Function function, const ::testing::tuple<A1, A2, A3, A4, A5, + A6, A7, A8, A9, A10>& args) { + return function(get<0>(args), get<1>(args), get<2>(args), + get<3>(args), get<4>(args), get<5>(args), get<6>(args), + get<7>(args), get<8>(args), get<9>(args)); + } + + template <class Class, typename MethodPtr> + static R InvokeMethod(Class* obj_ptr, + MethodPtr method_ptr, + const ::testing::tuple<A1, A2, A3, A4, A5, A6, A7, A8, + A9, A10>& args) { + return (obj_ptr->*method_ptr)(get<0>(args), get<1>(args), + get<2>(args), get<3>(args), get<4>(args), get<5>(args), + get<6>(args), get<7>(args), get<8>(args), get<9>(args)); + } +}; + +// An INTERNAL macro for extracting the type of a tuple field. It's +// subject to change without notice - DO NOT USE IN USER CODE! +#define GMOCK_FIELD_(Tuple, N) \ + typename ::testing::tuple_element<N, Tuple>::type + +// SelectArgs<Result, ArgumentTuple, k1, k2, ..., k_n>::type is the +// type of an n-ary function whose i-th (1-based) argument type is the +// k{i}-th (0-based) field of ArgumentTuple, which must be a tuple +// type, and whose return type is Result. For example, +// SelectArgs<int, ::testing::tuple<bool, char, double, long>, 0, 3>::type +// is int(bool, long). +// +// SelectArgs<Result, ArgumentTuple, k1, k2, ..., k_n>::Select(args) +// returns the selected fields (k1, k2, ..., k_n) of args as a tuple. +// For example, +// SelectArgs<int, tuple<bool, char, double>, 2, 0>::Select( +// ::testing::make_tuple(true, 'a', 2.5)) +// returns tuple (2.5, true). +// +// The numbers in list k1, k2, ..., k_n must be >= 0, where n can be +// in the range [0, 10]. Duplicates are allowed and they don't have +// to be in an ascending or descending order. 
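Spelling out the example from the comment above (a sketch; ArgTuple and Sel are invented names, and SelectArgs is internal machinery used here for exposition only):

#include "gmock/gmock.h"

typedef testing::tuple<bool, char, double> ArgTuple;
// Unused selector slots are -1, so the two-field partial specialization
// below is chosen; Sel::type is int(double, bool).
typedef testing::internal::SelectArgs<int, ArgTuple,
                                      2, 0, -1, -1, -1, -1, -1, -1, -1, -1>
    Sel;

testing::tuple<double, bool> picked =
    Sel::Select(testing::make_tuple(true, 'a', 2.5));
// get<0>(picked) == 2.5, get<1>(picked) == true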
+ +template <typename Result, typename ArgumentTuple, int k1, int k2, int k3, + int k4, int k5, int k6, int k7, int k8, int k9, int k10> +class SelectArgs { + public: + typedef Result type(GMOCK_FIELD_(ArgumentTuple, k1), + GMOCK_FIELD_(ArgumentTuple, k2), GMOCK_FIELD_(ArgumentTuple, k3), + GMOCK_FIELD_(ArgumentTuple, k4), GMOCK_FIELD_(ArgumentTuple, k5), + GMOCK_FIELD_(ArgumentTuple, k6), GMOCK_FIELD_(ArgumentTuple, k7), + GMOCK_FIELD_(ArgumentTuple, k8), GMOCK_FIELD_(ArgumentTuple, k9), + GMOCK_FIELD_(ArgumentTuple, k10)); + typedef typename Function<type>::ArgumentTuple SelectedArgs; + static SelectedArgs Select(const ArgumentTuple& args) { + return SelectedArgs(get<k1>(args), get<k2>(args), get<k3>(args), + get<k4>(args), get<k5>(args), get<k6>(args), get<k7>(args), + get<k8>(args), get<k9>(args), get<k10>(args)); + } +}; + +template <typename Result, typename ArgumentTuple> +class SelectArgs<Result, ArgumentTuple, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1> { + public: + typedef Result type(); + typedef typename Function<type>::ArgumentTuple SelectedArgs; + static SelectedArgs Select(const ArgumentTuple& /* args */) { + return SelectedArgs(); + } +}; + +template <typename Result, typename ArgumentTuple, int k1> +class SelectArgs<Result, ArgumentTuple, + k1, -1, -1, -1, -1, -1, -1, -1, -1, -1> { + public: + typedef Result type(GMOCK_FIELD_(ArgumentTuple, k1)); + typedef typename Function<type>::ArgumentTuple SelectedArgs; + static SelectedArgs Select(const ArgumentTuple& args) { + return SelectedArgs(get<k1>(args)); + } +}; + +template <typename Result, typename ArgumentTuple, int k1, int k2> +class SelectArgs<Result, ArgumentTuple, + k1, k2, -1, -1, -1, -1, -1, -1, -1, -1> { + public: + typedef Result type(GMOCK_FIELD_(ArgumentTuple, k1), + GMOCK_FIELD_(ArgumentTuple, k2)); + typedef typename Function<type>::ArgumentTuple SelectedArgs; + static SelectedArgs Select(const ArgumentTuple& args) { + return SelectedArgs(get<k1>(args), get<k2>(args)); + } +}; + +template <typename Result, typename ArgumentTuple, int k1, int k2, int k3> +class SelectArgs<Result, ArgumentTuple, + k1, k2, k3, -1, -1, -1, -1, -1, -1, -1> { + public: + typedef Result type(GMOCK_FIELD_(ArgumentTuple, k1), + GMOCK_FIELD_(ArgumentTuple, k2), GMOCK_FIELD_(ArgumentTuple, k3)); + typedef typename Function<type>::ArgumentTuple SelectedArgs; + static SelectedArgs Select(const ArgumentTuple& args) { + return SelectedArgs(get<k1>(args), get<k2>(args), get<k3>(args)); + } +}; + +template <typename Result, typename ArgumentTuple, int k1, int k2, int k3, + int k4> +class SelectArgs<Result, ArgumentTuple, + k1, k2, k3, k4, -1, -1, -1, -1, -1, -1> { + public: + typedef Result type(GMOCK_FIELD_(ArgumentTuple, k1), + GMOCK_FIELD_(ArgumentTuple, k2), GMOCK_FIELD_(ArgumentTuple, k3), + GMOCK_FIELD_(ArgumentTuple, k4)); + typedef typename Function<type>::ArgumentTuple SelectedArgs; + static SelectedArgs Select(const ArgumentTuple& args) { + return SelectedArgs(get<k1>(args), get<k2>(args), get<k3>(args), + get<k4>(args)); + } +}; + +template <typename Result, typename ArgumentTuple, int k1, int k2, int k3, + int k4, int k5> +class SelectArgs<Result, ArgumentTuple, + k1, k2, k3, k4, k5, -1, -1, -1, -1, -1> { + public: + typedef Result type(GMOCK_FIELD_(ArgumentTuple, k1), + GMOCK_FIELD_(ArgumentTuple, k2), GMOCK_FIELD_(ArgumentTuple, k3), + GMOCK_FIELD_(ArgumentTuple, k4), GMOCK_FIELD_(ArgumentTuple, k5)); + typedef typename Function<type>::ArgumentTuple SelectedArgs; + static SelectedArgs Select(const ArgumentTuple& args) { + return 
SelectedArgs(get<k1>(args), get<k2>(args), get<k3>(args), + get<k4>(args), get<k5>(args)); + } +}; + +template <typename Result, typename ArgumentTuple, int k1, int k2, int k3, + int k4, int k5, int k6> +class SelectArgs<Result, ArgumentTuple, + k1, k2, k3, k4, k5, k6, -1, -1, -1, -1> { + public: + typedef Result type(GMOCK_FIELD_(ArgumentTuple, k1), + GMOCK_FIELD_(ArgumentTuple, k2), GMOCK_FIELD_(ArgumentTuple, k3), + GMOCK_FIELD_(ArgumentTuple, k4), GMOCK_FIELD_(ArgumentTuple, k5), + GMOCK_FIELD_(ArgumentTuple, k6)); + typedef typename Function<type>::ArgumentTuple SelectedArgs; + static SelectedArgs Select(const ArgumentTuple& args) { + return SelectedArgs(get<k1>(args), get<k2>(args), get<k3>(args), + get<k4>(args), get<k5>(args), get<k6>(args)); + } +}; + +template <typename Result, typename ArgumentTuple, int k1, int k2, int k3, + int k4, int k5, int k6, int k7> +class SelectArgs<Result, ArgumentTuple, + k1, k2, k3, k4, k5, k6, k7, -1, -1, -1> { + public: + typedef Result type(GMOCK_FIELD_(ArgumentTuple, k1), + GMOCK_FIELD_(ArgumentTuple, k2), GMOCK_FIELD_(ArgumentTuple, k3), + GMOCK_FIELD_(ArgumentTuple, k4), GMOCK_FIELD_(ArgumentTuple, k5), + GMOCK_FIELD_(ArgumentTuple, k6), GMOCK_FIELD_(ArgumentTuple, k7)); + typedef typename Function<type>::ArgumentTuple SelectedArgs; + static SelectedArgs Select(const ArgumentTuple& args) { + return SelectedArgs(get<k1>(args), get<k2>(args), get<k3>(args), + get<k4>(args), get<k5>(args), get<k6>(args), get<k7>(args)); + } +}; + +template <typename Result, typename ArgumentTuple, int k1, int k2, int k3, + int k4, int k5, int k6, int k7, int k8> +class SelectArgs<Result, ArgumentTuple, + k1, k2, k3, k4, k5, k6, k7, k8, -1, -1> { + public: + typedef Result type(GMOCK_FIELD_(ArgumentTuple, k1), + GMOCK_FIELD_(ArgumentTuple, k2), GMOCK_FIELD_(ArgumentTuple, k3), + GMOCK_FIELD_(ArgumentTuple, k4), GMOCK_FIELD_(ArgumentTuple, k5), + GMOCK_FIELD_(ArgumentTuple, k6), GMOCK_FIELD_(ArgumentTuple, k7), + GMOCK_FIELD_(ArgumentTuple, k8)); + typedef typename Function<type>::ArgumentTuple SelectedArgs; + static SelectedArgs Select(const ArgumentTuple& args) { + return SelectedArgs(get<k1>(args), get<k2>(args), get<k3>(args), + get<k4>(args), get<k5>(args), get<k6>(args), get<k7>(args), + get<k8>(args)); + } +}; + +template <typename Result, typename ArgumentTuple, int k1, int k2, int k3, + int k4, int k5, int k6, int k7, int k8, int k9> +class SelectArgs<Result, ArgumentTuple, + k1, k2, k3, k4, k5, k6, k7, k8, k9, -1> { + public: + typedef Result type(GMOCK_FIELD_(ArgumentTuple, k1), + GMOCK_FIELD_(ArgumentTuple, k2), GMOCK_FIELD_(ArgumentTuple, k3), + GMOCK_FIELD_(ArgumentTuple, k4), GMOCK_FIELD_(ArgumentTuple, k5), + GMOCK_FIELD_(ArgumentTuple, k6), GMOCK_FIELD_(ArgumentTuple, k7), + GMOCK_FIELD_(ArgumentTuple, k8), GMOCK_FIELD_(ArgumentTuple, k9)); + typedef typename Function<type>::ArgumentTuple SelectedArgs; + static SelectedArgs Select(const ArgumentTuple& args) { + return SelectedArgs(get<k1>(args), get<k2>(args), get<k3>(args), + get<k4>(args), get<k5>(args), get<k6>(args), get<k7>(args), + get<k8>(args), get<k9>(args)); + } +}; + +#undef GMOCK_FIELD_ + +// Implements the WithArgs action. 
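+//
+// A minimal usage sketch (illustrative only; the mock method 'Foo' and the
+// free function 'SumOf' are hypothetical, not part of this header):
+//
+//   double SumOf(int x, double y) { return x + y; }
+//
+//   // Foo(int, const std::string&, double) returns double; forward only
+//   // arguments #0 and #2 to SumOf:
+//   EXPECT_CALL(mock, Foo(_, _, _))
+//       .WillOnce(WithArgs<0, 2>(Invoke(SumOf)));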
+template <typename InnerAction, int k1 = -1, int k2 = -1, int k3 = -1, + int k4 = -1, int k5 = -1, int k6 = -1, int k7 = -1, int k8 = -1, + int k9 = -1, int k10 = -1> +class WithArgsAction { + public: + explicit WithArgsAction(const InnerAction& action) : action_(action) {} + + template <typename F> + operator Action<F>() const { return MakeAction(new Impl<F>(action_)); } + + private: + template <typename F> + class Impl : public ActionInterface<F> { + public: + typedef typename Function<F>::Result Result; + typedef typename Function<F>::ArgumentTuple ArgumentTuple; + + explicit Impl(const InnerAction& action) : action_(action) {} + + virtual Result Perform(const ArgumentTuple& args) { + return action_.Perform(SelectArgs<Result, ArgumentTuple, k1, k2, k3, k4, + k5, k6, k7, k8, k9, k10>::Select(args)); + } + + private: + typedef typename SelectArgs<Result, ArgumentTuple, + k1, k2, k3, k4, k5, k6, k7, k8, k9, k10>::type InnerFunctionType; + + Action<InnerFunctionType> action_; + }; + + const InnerAction action_; + + GTEST_DISALLOW_ASSIGN_(WithArgsAction); +}; + +// A macro from the ACTION* family (defined later in this file) +// defines an action that can be used in a mock function. Typically, +// these actions only care about a subset of the arguments of the mock +// function. For example, if such an action only uses the second +// argument, it can be used in any mock function that takes >= 2 +// arguments where the type of the second argument is compatible. +// +// Therefore, the action implementation must be prepared to take more +// arguments than it needs. The ExcessiveArg type is used to +// represent those excessive arguments. In order to keep the compiler +// error messages tractable, we define it in the testing namespace +// instead of testing::internal. However, this is an INTERNAL TYPE +// and subject to change without notice, so a user MUST NOT USE THIS +// TYPE DIRECTLY. +struct ExcessiveArg {}; + +// A helper class needed for implementing the ACTION* macros. 
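+//
+// Sketch of the mechanism (not part of the original header): for a mock
+// function taking two arguments, Perform() below expands to
+//
+//   impl->gmock_PerformImpl<A0, A1>(args, get<0>(args), get<1>(args),
+//       ExcessiveArg(), ExcessiveArg(), ..., ExcessiveArg());
+//
+// i.e. the unused trailing parameters are padded with ExcessiveArg() so that
+// a single ten-parameter gmock_PerformImpl serves every arity from 0 to 10.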
+template <typename Result, class Impl> +class ActionHelper { + public: + static Result Perform(Impl* impl, const ::testing::tuple<>& args) { + return impl->template gmock_PerformImpl<>(args, ExcessiveArg(), + ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), + ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), + ExcessiveArg()); + } + + template <typename A0> + static Result Perform(Impl* impl, const ::testing::tuple<A0>& args) { + return impl->template gmock_PerformImpl<A0>(args, get<0>(args), + ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), + ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), + ExcessiveArg()); + } + + template <typename A0, typename A1> + static Result Perform(Impl* impl, const ::testing::tuple<A0, A1>& args) { + return impl->template gmock_PerformImpl<A0, A1>(args, get<0>(args), + get<1>(args), ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), + ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), + ExcessiveArg()); + } + + template <typename A0, typename A1, typename A2> + static Result Perform(Impl* impl, const ::testing::tuple<A0, A1, A2>& args) { + return impl->template gmock_PerformImpl<A0, A1, A2>(args, get<0>(args), + get<1>(args), get<2>(args), ExcessiveArg(), ExcessiveArg(), + ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), + ExcessiveArg()); + } + + template <typename A0, typename A1, typename A2, typename A3> + static Result Perform(Impl* impl, const ::testing::tuple<A0, A1, A2, + A3>& args) { + return impl->template gmock_PerformImpl<A0, A1, A2, A3>(args, get<0>(args), + get<1>(args), get<2>(args), get<3>(args), ExcessiveArg(), + ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), + ExcessiveArg()); + } + + template <typename A0, typename A1, typename A2, typename A3, typename A4> + static Result Perform(Impl* impl, const ::testing::tuple<A0, A1, A2, A3, + A4>& args) { + return impl->template gmock_PerformImpl<A0, A1, A2, A3, A4>(args, + get<0>(args), get<1>(args), get<2>(args), get<3>(args), get<4>(args), + ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), + ExcessiveArg()); + } + + template <typename A0, typename A1, typename A2, typename A3, typename A4, + typename A5> + static Result Perform(Impl* impl, const ::testing::tuple<A0, A1, A2, A3, A4, + A5>& args) { + return impl->template gmock_PerformImpl<A0, A1, A2, A3, A4, A5>(args, + get<0>(args), get<1>(args), get<2>(args), get<3>(args), get<4>(args), + get<5>(args), ExcessiveArg(), ExcessiveArg(), ExcessiveArg(), + ExcessiveArg()); + } + + template <typename A0, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6> + static Result Perform(Impl* impl, const ::testing::tuple<A0, A1, A2, A3, A4, + A5, A6>& args) { + return impl->template gmock_PerformImpl<A0, A1, A2, A3, A4, A5, A6>(args, + get<0>(args), get<1>(args), get<2>(args), get<3>(args), get<4>(args), + get<5>(args), get<6>(args), ExcessiveArg(), ExcessiveArg(), + ExcessiveArg()); + } + + template <typename A0, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7> + static Result Perform(Impl* impl, const ::testing::tuple<A0, A1, A2, A3, A4, + A5, A6, A7>& args) { + return impl->template gmock_PerformImpl<A0, A1, A2, A3, A4, A5, A6, + A7>(args, get<0>(args), get<1>(args), get<2>(args), get<3>(args), + get<4>(args), get<5>(args), get<6>(args), get<7>(args), ExcessiveArg(), + ExcessiveArg()); + } + + template <typename A0, typename A1, typename A2, typename A3, typename A4, + 
typename A5, typename A6, typename A7, typename A8> + static Result Perform(Impl* impl, const ::testing::tuple<A0, A1, A2, A3, A4, + A5, A6, A7, A8>& args) { + return impl->template gmock_PerformImpl<A0, A1, A2, A3, A4, A5, A6, A7, + A8>(args, get<0>(args), get<1>(args), get<2>(args), get<3>(args), + get<4>(args), get<5>(args), get<6>(args), get<7>(args), get<8>(args), + ExcessiveArg()); + } + + template <typename A0, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7, typename A8, typename A9> + static Result Perform(Impl* impl, const ::testing::tuple<A0, A1, A2, A3, A4, + A5, A6, A7, A8, A9>& args) { + return impl->template gmock_PerformImpl<A0, A1, A2, A3, A4, A5, A6, A7, A8, + A9>(args, get<0>(args), get<1>(args), get<2>(args), get<3>(args), + get<4>(args), get<5>(args), get<6>(args), get<7>(args), get<8>(args), + get<9>(args)); + } +}; + +} // namespace internal + +// Various overloads for Invoke(). + +// WithArgs<N1, N2, ..., Nk>(an_action) creates an action that passes +// the selected arguments of the mock function to an_action and +// performs it. It serves as an adaptor between actions with +// different argument lists. C++ doesn't support default arguments for +// function templates, so we have to overload it. +template <int k1, typename InnerAction> +inline internal::WithArgsAction<InnerAction, k1> +WithArgs(const InnerAction& action) { + return internal::WithArgsAction<InnerAction, k1>(action); +} + +template <int k1, int k2, typename InnerAction> +inline internal::WithArgsAction<InnerAction, k1, k2> +WithArgs(const InnerAction& action) { + return internal::WithArgsAction<InnerAction, k1, k2>(action); +} + +template <int k1, int k2, int k3, typename InnerAction> +inline internal::WithArgsAction<InnerAction, k1, k2, k3> +WithArgs(const InnerAction& action) { + return internal::WithArgsAction<InnerAction, k1, k2, k3>(action); +} + +template <int k1, int k2, int k3, int k4, typename InnerAction> +inline internal::WithArgsAction<InnerAction, k1, k2, k3, k4> +WithArgs(const InnerAction& action) { + return internal::WithArgsAction<InnerAction, k1, k2, k3, k4>(action); +} + +template <int k1, int k2, int k3, int k4, int k5, typename InnerAction> +inline internal::WithArgsAction<InnerAction, k1, k2, k3, k4, k5> +WithArgs(const InnerAction& action) { + return internal::WithArgsAction<InnerAction, k1, k2, k3, k4, k5>(action); +} + +template <int k1, int k2, int k3, int k4, int k5, int k6, typename InnerAction> +inline internal::WithArgsAction<InnerAction, k1, k2, k3, k4, k5, k6> +WithArgs(const InnerAction& action) { + return internal::WithArgsAction<InnerAction, k1, k2, k3, k4, k5, k6>(action); +} + +template <int k1, int k2, int k3, int k4, int k5, int k6, int k7, + typename InnerAction> +inline internal::WithArgsAction<InnerAction, k1, k2, k3, k4, k5, k6, k7> +WithArgs(const InnerAction& action) { + return internal::WithArgsAction<InnerAction, k1, k2, k3, k4, k5, k6, + k7>(action); +} + +template <int k1, int k2, int k3, int k4, int k5, int k6, int k7, int k8, + typename InnerAction> +inline internal::WithArgsAction<InnerAction, k1, k2, k3, k4, k5, k6, k7, k8> +WithArgs(const InnerAction& action) { + return internal::WithArgsAction<InnerAction, k1, k2, k3, k4, k5, k6, k7, + k8>(action); +} + +template <int k1, int k2, int k3, int k4, int k5, int k6, int k7, int k8, + int k9, typename InnerAction> +inline internal::WithArgsAction<InnerAction, k1, k2, k3, k4, k5, k6, k7, k8, k9> +WithArgs(const InnerAction& action) { + return 
internal::WithArgsAction<InnerAction, k1, k2, k3, k4, k5, k6, k7, k8, + k9>(action); +} + +template <int k1, int k2, int k3, int k4, int k5, int k6, int k7, int k8, + int k9, int k10, typename InnerAction> +inline internal::WithArgsAction<InnerAction, k1, k2, k3, k4, k5, k6, k7, k8, + k9, k10> +WithArgs(const InnerAction& action) { + return internal::WithArgsAction<InnerAction, k1, k2, k3, k4, k5, k6, k7, k8, + k9, k10>(action); +} + +// Creates an action that does actions a1, a2, ..., sequentially in +// each invocation. +template <typename Action1, typename Action2> +inline internal::DoBothAction<Action1, Action2> +DoAll(Action1 a1, Action2 a2) { + return internal::DoBothAction<Action1, Action2>(a1, a2); +} + +template <typename Action1, typename Action2, typename Action3> +inline internal::DoBothAction<Action1, internal::DoBothAction<Action2, + Action3> > +DoAll(Action1 a1, Action2 a2, Action3 a3) { + return DoAll(a1, DoAll(a2, a3)); +} + +template <typename Action1, typename Action2, typename Action3, + typename Action4> +inline internal::DoBothAction<Action1, internal::DoBothAction<Action2, + internal::DoBothAction<Action3, Action4> > > +DoAll(Action1 a1, Action2 a2, Action3 a3, Action4 a4) { + return DoAll(a1, DoAll(a2, a3, a4)); +} + +template <typename Action1, typename Action2, typename Action3, + typename Action4, typename Action5> +inline internal::DoBothAction<Action1, internal::DoBothAction<Action2, + internal::DoBothAction<Action3, internal::DoBothAction<Action4, + Action5> > > > +DoAll(Action1 a1, Action2 a2, Action3 a3, Action4 a4, Action5 a5) { + return DoAll(a1, DoAll(a2, a3, a4, a5)); +} + +template <typename Action1, typename Action2, typename Action3, + typename Action4, typename Action5, typename Action6> +inline internal::DoBothAction<Action1, internal::DoBothAction<Action2, + internal::DoBothAction<Action3, internal::DoBothAction<Action4, + internal::DoBothAction<Action5, Action6> > > > > +DoAll(Action1 a1, Action2 a2, Action3 a3, Action4 a4, Action5 a5, Action6 a6) { + return DoAll(a1, DoAll(a2, a3, a4, a5, a6)); +} + +template <typename Action1, typename Action2, typename Action3, + typename Action4, typename Action5, typename Action6, typename Action7> +inline internal::DoBothAction<Action1, internal::DoBothAction<Action2, + internal::DoBothAction<Action3, internal::DoBothAction<Action4, + internal::DoBothAction<Action5, internal::DoBothAction<Action6, + Action7> > > > > > +DoAll(Action1 a1, Action2 a2, Action3 a3, Action4 a4, Action5 a5, Action6 a6, + Action7 a7) { + return DoAll(a1, DoAll(a2, a3, a4, a5, a6, a7)); +} + +template <typename Action1, typename Action2, typename Action3, + typename Action4, typename Action5, typename Action6, typename Action7, + typename Action8> +inline internal::DoBothAction<Action1, internal::DoBothAction<Action2, + internal::DoBothAction<Action3, internal::DoBothAction<Action4, + internal::DoBothAction<Action5, internal::DoBothAction<Action6, + internal::DoBothAction<Action7, Action8> > > > > > > +DoAll(Action1 a1, Action2 a2, Action3 a3, Action4 a4, Action5 a5, Action6 a6, + Action7 a7, Action8 a8) { + return DoAll(a1, DoAll(a2, a3, a4, a5, a6, a7, a8)); +} + +template <typename Action1, typename Action2, typename Action3, + typename Action4, typename Action5, typename Action6, typename Action7, + typename Action8, typename Action9> +inline internal::DoBothAction<Action1, internal::DoBothAction<Action2, + internal::DoBothAction<Action3, internal::DoBothAction<Action4, + internal::DoBothAction<Action5, 
internal::DoBothAction<Action6,
+ internal::DoBothAction<Action7, internal::DoBothAction<Action8,
+ Action9> > > > > > > >
+DoAll(Action1 a1, Action2 a2, Action3 a3, Action4 a4, Action5 a5, Action6 a6,
+ Action7 a7, Action8 a8, Action9 a9) {
+ return DoAll(a1, DoAll(a2, a3, a4, a5, a6, a7, a8, a9));
+}
+
+template <typename Action1, typename Action2, typename Action3,
+ typename Action4, typename Action5, typename Action6, typename Action7,
+ typename Action8, typename Action9, typename Action10>
+inline internal::DoBothAction<Action1, internal::DoBothAction<Action2,
+ internal::DoBothAction<Action3, internal::DoBothAction<Action4,
+ internal::DoBothAction<Action5, internal::DoBothAction<Action6,
+ internal::DoBothAction<Action7, internal::DoBothAction<Action8,
+ internal::DoBothAction<Action9, Action10> > > > > > > > >
+DoAll(Action1 a1, Action2 a2, Action3 a3, Action4 a4, Action5 a5, Action6 a6,
+ Action7 a7, Action8 a8, Action9 a9, Action10 a10) {
+ return DoAll(a1, DoAll(a2, a3, a4, a5, a6, a7, a8, a9, a10));
+}
+
+}  // namespace testing
+
+// The ACTION* family of macros can be used in a namespace scope to
+// define custom actions easily. The syntax:
+//
+//   ACTION(name) { statements; }
+//
+// will define an action with the given name that executes the
+// statements. The value returned by the statements will be used as
+// the return value of the action. Inside the statements, you can
+// refer to the K-th (0-based) argument of the mock function by
+// 'argK', and refer to its type by 'argK_type'. For example:
+//
+//   ACTION(IncrementArg1) {
+//     arg1_type temp = arg1;
+//     return ++(*temp);
+//   }
+//
+// allows you to write
+//
+//   ...WillOnce(IncrementArg1());
+//
+// You can also refer to the entire argument tuple and its type by
+// 'args' and 'args_type', and refer to the mock function type and its
+// return type by 'function_type' and 'return_type'.
+//
+// Note that you don't need to specify the types of the mock function
+// arguments. However, rest assured that your code is still type-safe:
+// you'll get a compiler error if *arg1 doesn't support the ++
+// operator, or if the type of ++(*arg1) isn't compatible with the
+// mock function's return type, for example.
+//
+// Sometimes you'll want to parameterize the action. For that you can use
+// another macro:
+//
+//   ACTION_P(name, param_name) { statements; }
+//
+// For example:
+//
+//   ACTION_P(Add, n) { return arg0 + n; }
+//
+// will allow you to write:
+//
+//   ...WillOnce(Add(5));
+//
+// Note that you don't need to provide the type of the parameter
+// either. If you need to reference the type of a parameter named
+// 'foo', you can write 'foo_type'. For example, in the body of
+// ACTION_P(Add, n) above, you can write 'n_type' to refer to the type
+// of 'n'.
+//
+// We also provide ACTION_P2, ACTION_P3, ..., up to ACTION_P10 to support
+// multi-parameter actions.
+//
+// For the purpose of typing, you can view
+//
+//   ACTION_Pk(Foo, p1, ..., pk) { ... }
+//
+// as shorthand for
+//
+//   template <typename p1_type, ..., typename pk_type>
+//   FooActionPk<p1_type, ..., pk_type> Foo(p1_type p1, ..., pk_type pk) { ... }
+//
+// In particular, you can provide the template type arguments
+// explicitly when invoking Foo(), as in Foo<long, bool>(5, false);
+// although usually you can rely on the compiler to infer the types
+// for you automatically. You can assign the result of expression
+// Foo(p1, ..., pk) to a variable of type FooActionPk<p1_type, ...,
+// pk_type>. This can be useful when composing actions.
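+//
+// A minimal sketch (illustrative; builds on the Add action defined above):
+//
+//   AddActionP<int> add_five = Add(5);  // a named, reusable action object
+//   ...WillOnce(add_five);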
+//
+// You can also overload actions with different numbers of parameters:
+//
+//   ACTION_P(Plus, a) { ... }
+//   ACTION_P2(Plus, a, b) { ... }
+//
+// While it's tempting to always use the ACTION* macros when defining
+// a new action, you should also consider implementing ActionInterface
+// or using MakePolymorphicAction() instead, especially if you need to
+// use the action a lot. While these approaches require more work,
+// they give you more control over the types of the mock function
+// arguments and the action parameters, which in general leads to
+// better compiler error messages that pay off in the long run. They
+// also allow overloading actions based on parameter types (as opposed
+// to just based on the number of parameters).
+//
+// CAVEAT:
+//
+// ACTION*() can only be used in a namespace scope. The reason is
+// that C++98 doesn't allow function-local types to be used to
+// instantiate templates. C++11 lifts this restriction, so a future
+// version may support using ACTION*() inside a function.
+//
+// MORE INFORMATION:
+//
+// To learn more about using these macros, please search for 'ACTION'
+// on http://code.google.com/p/googlemock/wiki/CookBook.
+
+// An internal macro needed for implementing ACTION*().
+#define GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_\
+ const args_type& args GTEST_ATTRIBUTE_UNUSED_, \
+ arg0_type arg0 GTEST_ATTRIBUTE_UNUSED_, \
+ arg1_type arg1 GTEST_ATTRIBUTE_UNUSED_, \
+ arg2_type arg2 GTEST_ATTRIBUTE_UNUSED_, \
+ arg3_type arg3 GTEST_ATTRIBUTE_UNUSED_, \
+ arg4_type arg4 GTEST_ATTRIBUTE_UNUSED_, \
+ arg5_type arg5 GTEST_ATTRIBUTE_UNUSED_, \
+ arg6_type arg6 GTEST_ATTRIBUTE_UNUSED_, \
+ arg7_type arg7 GTEST_ATTRIBUTE_UNUSED_, \
+ arg8_type arg8 GTEST_ATTRIBUTE_UNUSED_, \
+ arg9_type arg9 GTEST_ATTRIBUTE_UNUSED_
+
+// Sometimes you want to give an action explicit template parameters
+// that cannot be inferred from its value parameters. ACTION() and
+// ACTION_P*() don't support that. ACTION_TEMPLATE() remedies that
+// and can be viewed as an extension to ACTION() and ACTION_P*().
+//
+// The syntax:
+//
+//   ACTION_TEMPLATE(ActionName,
+//                   HAS_m_TEMPLATE_PARAMS(kind1, name1, ..., kind_m, name_m),
+//                   AND_n_VALUE_PARAMS(p1, ..., p_n)) { statements; }
+//
+// defines an action template that takes m explicit template
+// parameters and n value parameters. name_i is the name of the i-th
+// template parameter, and kind_i specifies whether it's a typename,
+// an integral constant, or a template. p_i is the name of the i-th
+// value parameter.
+//
+// Example:
+//
+//   // DuplicateArg<k, T>(output) converts the k-th argument of the mock
+//   // function to type T and copies it to *output.
+//   ACTION_TEMPLATE(DuplicateArg,
+//                   HAS_2_TEMPLATE_PARAMS(int, k, typename, T),
+//                   AND_1_VALUE_PARAMS(output)) {
+//     *output = T(::testing::get<k>(args));
+//   }
+//   ...
+//   int n;
+//   EXPECT_CALL(mock, Foo(_, _))
+//       .WillOnce(DuplicateArg<1, unsigned char>(&n));
+//
+// To create an instance of an action template, write:
+//
+//   ActionName<t1, ..., t_m>(v1, ..., v_n)
+//
+// where the ts are the template arguments and the vs are the value
+// arguments. The value argument types are inferred by the compiler.
+// If you want to explicitly specify the value argument types, you can
+// provide additional template arguments:
+//
+//   ActionName<t1, ..., t_m, u1, ..., u_k>(v1, ..., v_n)
+//
+// where u_i is the desired type of v_i.
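+//
+// For instance (illustrative, reusing the DuplicateArg example above):
+//
+//   DuplicateArg<1, unsigned char, int*>(&n)
+//
+// explicitly passes the value argument 'output' as int* instead of letting
+// the compiler infer its type.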
+// +// ACTION_TEMPLATE and ACTION/ACTION_P* can be overloaded on the +// number of value parameters, but not on the number of template +// parameters. Without the restriction, the meaning of the following +// is unclear: +// +// OverloadedAction<int, bool>(x); +// +// Are we using a single-template-parameter action where 'bool' refers +// to the type of x, or are we using a two-template-parameter action +// where the compiler is asked to infer the type of x? +// +// Implementation notes: +// +// GMOCK_INTERNAL_*_HAS_m_TEMPLATE_PARAMS and +// GMOCK_INTERNAL_*_AND_n_VALUE_PARAMS are internal macros for +// implementing ACTION_TEMPLATE. The main trick we use is to create +// new macro invocations when expanding a macro. For example, we have +// +// #define ACTION_TEMPLATE(name, template_params, value_params) +// ... GMOCK_INTERNAL_DECL_##template_params ... +// +// which causes ACTION_TEMPLATE(..., HAS_1_TEMPLATE_PARAMS(typename, T), ...) +// to expand to +// +// ... GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS(typename, T) ... +// +// Since GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS is a macro, the +// preprocessor will continue to expand it to +// +// ... typename T ... +// +// This technique conforms to the C++ standard and is portable. It +// allows us to implement action templates using O(N) code, where N is +// the maximum number of template/value parameters supported. Without +// using it, we'd have to devote O(N^2) amount of code to implement all +// combinations of m and n. + +// Declares the template parameters. +#define GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS(kind0, name0) kind0 name0 +#define GMOCK_INTERNAL_DECL_HAS_2_TEMPLATE_PARAMS(kind0, name0, kind1, \ + name1) kind0 name0, kind1 name1 +#define GMOCK_INTERNAL_DECL_HAS_3_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2) kind0 name0, kind1 name1, kind2 name2 +#define GMOCK_INTERNAL_DECL_HAS_4_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2, kind3, name3) kind0 name0, kind1 name1, kind2 name2, \ + kind3 name3 +#define GMOCK_INTERNAL_DECL_HAS_5_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2, kind3, name3, kind4, name4) kind0 name0, kind1 name1, \ + kind2 name2, kind3 name3, kind4 name4 +#define GMOCK_INTERNAL_DECL_HAS_6_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2, kind3, name3, kind4, name4, kind5, name5) kind0 name0, \ + kind1 name1, kind2 name2, kind3 name3, kind4 name4, kind5 name5 +#define GMOCK_INTERNAL_DECL_HAS_7_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, \ + name6) kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \ + kind5 name5, kind6 name6 +#define GMOCK_INTERNAL_DECL_HAS_8_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, name6, \ + kind7, name7) kind0 name0, kind1 name1, kind2 name2, kind3 name3, \ + kind4 name4, kind5 name5, kind6 name6, kind7 name7 +#define GMOCK_INTERNAL_DECL_HAS_9_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, name6, \ + kind7, name7, kind8, name8) kind0 name0, kind1 name1, kind2 name2, \ + kind3 name3, kind4 name4, kind5 name5, kind6 name6, kind7 name7, \ + kind8 name8 +#define GMOCK_INTERNAL_DECL_HAS_10_TEMPLATE_PARAMS(kind0, name0, kind1, \ + name1, kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, \ + name6, kind7, name7, kind8, name8, kind9, name9) kind0 name0, \ + kind1 name1, kind2 name2, kind3 name3, kind4 name4, kind5 
name5, \ + kind6 name6, kind7 name7, kind8 name8, kind9 name9 + +// Lists the template parameters. +#define GMOCK_INTERNAL_LIST_HAS_1_TEMPLATE_PARAMS(kind0, name0) name0 +#define GMOCK_INTERNAL_LIST_HAS_2_TEMPLATE_PARAMS(kind0, name0, kind1, \ + name1) name0, name1 +#define GMOCK_INTERNAL_LIST_HAS_3_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2) name0, name1, name2 +#define GMOCK_INTERNAL_LIST_HAS_4_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2, kind3, name3) name0, name1, name2, name3 +#define GMOCK_INTERNAL_LIST_HAS_5_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2, kind3, name3, kind4, name4) name0, name1, name2, name3, \ + name4 +#define GMOCK_INTERNAL_LIST_HAS_6_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2, kind3, name3, kind4, name4, kind5, name5) name0, name1, \ + name2, name3, name4, name5 +#define GMOCK_INTERNAL_LIST_HAS_7_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, \ + name6) name0, name1, name2, name3, name4, name5, name6 +#define GMOCK_INTERNAL_LIST_HAS_8_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, name6, \ + kind7, name7) name0, name1, name2, name3, name4, name5, name6, name7 +#define GMOCK_INTERNAL_LIST_HAS_9_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \ + kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, name6, \ + kind7, name7, kind8, name8) name0, name1, name2, name3, name4, name5, \ + name6, name7, name8 +#define GMOCK_INTERNAL_LIST_HAS_10_TEMPLATE_PARAMS(kind0, name0, kind1, \ + name1, kind2, name2, kind3, name3, kind4, name4, kind5, name5, kind6, \ + name6, kind7, name7, kind8, name8, kind9, name9) name0, name1, name2, \ + name3, name4, name5, name6, name7, name8, name9 + +// Declares the types of value parameters. 
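+//
+// For instance (illustrative expansion),
+// GMOCK_INTERNAL_DECL_TYPE_AND_2_VALUE_PARAMS(a, b) below expands to
+//
+//   , typename a_type, typename b_type
+//
+// The leading comma lets the expansion splice directly after the explicit
+// template parameters in ACTION_TEMPLATE.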
+#define GMOCK_INTERNAL_DECL_TYPE_AND_0_VALUE_PARAMS() +#define GMOCK_INTERNAL_DECL_TYPE_AND_1_VALUE_PARAMS(p0) , typename p0##_type +#define GMOCK_INTERNAL_DECL_TYPE_AND_2_VALUE_PARAMS(p0, p1) , \ + typename p0##_type, typename p1##_type +#define GMOCK_INTERNAL_DECL_TYPE_AND_3_VALUE_PARAMS(p0, p1, p2) , \ + typename p0##_type, typename p1##_type, typename p2##_type +#define GMOCK_INTERNAL_DECL_TYPE_AND_4_VALUE_PARAMS(p0, p1, p2, p3) , \ + typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type +#define GMOCK_INTERNAL_DECL_TYPE_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) , \ + typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type +#define GMOCK_INTERNAL_DECL_TYPE_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) , \ + typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type +#define GMOCK_INTERNAL_DECL_TYPE_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ + p6) , typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type +#define GMOCK_INTERNAL_DECL_TYPE_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ + p6, p7) , typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type +#define GMOCK_INTERNAL_DECL_TYPE_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ + p6, p7, p8) , typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type +#define GMOCK_INTERNAL_DECL_TYPE_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ + p6, p7, p8, p9) , typename p0##_type, typename p1##_type, \ + typename p2##_type, typename p3##_type, typename p4##_type, \ + typename p5##_type, typename p6##_type, typename p7##_type, \ + typename p8##_type, typename p9##_type + +// Initializes the value parameters. 
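+//
+// For instance (illustrative expansion),
+// GMOCK_INTERNAL_INIT_AND_2_VALUE_PARAMS(a, b) below expands to
+//
+//   (a_type gmock_p0, b_type gmock_p1) : a(gmock_p0), b(gmock_p1)
+//
+// i.e. a constructor parameter list followed by the member initializers for
+// the stored parameters.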
+#define GMOCK_INTERNAL_INIT_AND_0_VALUE_PARAMS()\ + () +#define GMOCK_INTERNAL_INIT_AND_1_VALUE_PARAMS(p0)\ + (p0##_type gmock_p0) : p0(gmock_p0) +#define GMOCK_INTERNAL_INIT_AND_2_VALUE_PARAMS(p0, p1)\ + (p0##_type gmock_p0, p1##_type gmock_p1) : p0(gmock_p0), p1(gmock_p1) +#define GMOCK_INTERNAL_INIT_AND_3_VALUE_PARAMS(p0, p1, p2)\ + (p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2) +#define GMOCK_INTERNAL_INIT_AND_4_VALUE_PARAMS(p0, p1, p2, p3)\ + (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3) +#define GMOCK_INTERNAL_INIT_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4)\ + (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4) : p0(gmock_p0), p1(gmock_p1), \ + p2(gmock_p2), p3(gmock_p3), p4(gmock_p4) +#define GMOCK_INTERNAL_INIT_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5)\ + (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, \ + p5##_type gmock_p5) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5) +#define GMOCK_INTERNAL_INIT_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6)\ + (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ + p6##_type gmock_p6) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6) +#define GMOCK_INTERNAL_INIT_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7)\ + (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ + p6##_type gmock_p6, p7##_type gmock_p7) : p0(gmock_p0), p1(gmock_p1), \ + p2(gmock_p2), p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), \ + p7(gmock_p7) +#define GMOCK_INTERNAL_INIT_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7, p8)\ + (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ + p6##_type gmock_p6, p7##_type gmock_p7, \ + p8##_type gmock_p8) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), p7(gmock_p7), \ + p8(gmock_p8) +#define GMOCK_INTERNAL_INIT_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7, p8, p9)\ + (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ + p6##_type gmock_p6, p7##_type gmock_p7, p8##_type gmock_p8, \ + p9##_type gmock_p9) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), p7(gmock_p7), \ + p8(gmock_p8), p9(gmock_p9) + +// Declares the fields for storing the value parameters. 
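+//
+// For instance (illustrative expansion),
+// GMOCK_INTERNAL_DEFN_AND_2_VALUE_PARAMS(a, b) below expands to
+//
+//   a_type a; b_type b;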
+#define GMOCK_INTERNAL_DEFN_AND_0_VALUE_PARAMS() +#define GMOCK_INTERNAL_DEFN_AND_1_VALUE_PARAMS(p0) p0##_type p0; +#define GMOCK_INTERNAL_DEFN_AND_2_VALUE_PARAMS(p0, p1) p0##_type p0; \ + p1##_type p1; +#define GMOCK_INTERNAL_DEFN_AND_3_VALUE_PARAMS(p0, p1, p2) p0##_type p0; \ + p1##_type p1; p2##_type p2; +#define GMOCK_INTERNAL_DEFN_AND_4_VALUE_PARAMS(p0, p1, p2, p3) p0##_type p0; \ + p1##_type p1; p2##_type p2; p3##_type p3; +#define GMOCK_INTERNAL_DEFN_AND_5_VALUE_PARAMS(p0, p1, p2, p3, \ + p4) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; p4##_type p4; +#define GMOCK_INTERNAL_DEFN_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, \ + p5) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; p4##_type p4; \ + p5##_type p5; +#define GMOCK_INTERNAL_DEFN_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ + p6) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; p4##_type p4; \ + p5##_type p5; p6##_type p6; +#define GMOCK_INTERNAL_DEFN_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; p4##_type p4; \ + p5##_type p5; p6##_type p6; p7##_type p7; +#define GMOCK_INTERNAL_DEFN_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7, p8) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; \ + p4##_type p4; p5##_type p5; p6##_type p6; p7##_type p7; p8##_type p8; +#define GMOCK_INTERNAL_DEFN_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7, p8, p9) p0##_type p0; p1##_type p1; p2##_type p2; p3##_type p3; \ + p4##_type p4; p5##_type p5; p6##_type p6; p7##_type p7; p8##_type p8; \ + p9##_type p9; + +// Lists the value parameters. +#define GMOCK_INTERNAL_LIST_AND_0_VALUE_PARAMS() +#define GMOCK_INTERNAL_LIST_AND_1_VALUE_PARAMS(p0) p0 +#define GMOCK_INTERNAL_LIST_AND_2_VALUE_PARAMS(p0, p1) p0, p1 +#define GMOCK_INTERNAL_LIST_AND_3_VALUE_PARAMS(p0, p1, p2) p0, p1, p2 +#define GMOCK_INTERNAL_LIST_AND_4_VALUE_PARAMS(p0, p1, p2, p3) p0, p1, p2, p3 +#define GMOCK_INTERNAL_LIST_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) p0, p1, \ + p2, p3, p4 +#define GMOCK_INTERNAL_LIST_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) p0, \ + p1, p2, p3, p4, p5 +#define GMOCK_INTERNAL_LIST_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ + p6) p0, p1, p2, p3, p4, p5, p6 +#define GMOCK_INTERNAL_LIST_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7) p0, p1, p2, p3, p4, p5, p6, p7 +#define GMOCK_INTERNAL_LIST_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7, p8) p0, p1, p2, p3, p4, p5, p6, p7, p8 +#define GMOCK_INTERNAL_LIST_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7, p8, p9) p0, p1, p2, p3, p4, p5, p6, p7, p8, p9 + +// Lists the value parameter types. 
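+//
+// For instance (illustrative expansion),
+// GMOCK_INTERNAL_LIST_TYPE_AND_2_VALUE_PARAMS(a, b) below expands to
+//
+//   , a_type, b_type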
+#define GMOCK_INTERNAL_LIST_TYPE_AND_0_VALUE_PARAMS() +#define GMOCK_INTERNAL_LIST_TYPE_AND_1_VALUE_PARAMS(p0) , p0##_type +#define GMOCK_INTERNAL_LIST_TYPE_AND_2_VALUE_PARAMS(p0, p1) , p0##_type, \ + p1##_type +#define GMOCK_INTERNAL_LIST_TYPE_AND_3_VALUE_PARAMS(p0, p1, p2) , p0##_type, \ + p1##_type, p2##_type +#define GMOCK_INTERNAL_LIST_TYPE_AND_4_VALUE_PARAMS(p0, p1, p2, p3) , \ + p0##_type, p1##_type, p2##_type, p3##_type +#define GMOCK_INTERNAL_LIST_TYPE_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) , \ + p0##_type, p1##_type, p2##_type, p3##_type, p4##_type +#define GMOCK_INTERNAL_LIST_TYPE_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) , \ + p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type +#define GMOCK_INTERNAL_LIST_TYPE_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ + p6) , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, \ + p6##_type +#define GMOCK_INTERNAL_LIST_TYPE_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ + p6, p7) , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \ + p5##_type, p6##_type, p7##_type +#define GMOCK_INTERNAL_LIST_TYPE_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ + p6, p7, p8) , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \ + p5##_type, p6##_type, p7##_type, p8##_type +#define GMOCK_INTERNAL_LIST_TYPE_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ + p6, p7, p8, p9) , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \ + p5##_type, p6##_type, p7##_type, p8##_type, p9##_type + +// Declares the value parameters. +#define GMOCK_INTERNAL_DECL_AND_0_VALUE_PARAMS() +#define GMOCK_INTERNAL_DECL_AND_1_VALUE_PARAMS(p0) p0##_type p0 +#define GMOCK_INTERNAL_DECL_AND_2_VALUE_PARAMS(p0, p1) p0##_type p0, \ + p1##_type p1 +#define GMOCK_INTERNAL_DECL_AND_3_VALUE_PARAMS(p0, p1, p2) p0##_type p0, \ + p1##_type p1, p2##_type p2 +#define GMOCK_INTERNAL_DECL_AND_4_VALUE_PARAMS(p0, p1, p2, p3) p0##_type p0, \ + p1##_type p1, p2##_type p2, p3##_type p3 +#define GMOCK_INTERNAL_DECL_AND_5_VALUE_PARAMS(p0, p1, p2, p3, \ + p4) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4 +#define GMOCK_INTERNAL_DECL_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, \ + p5) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \ + p5##_type p5 +#define GMOCK_INTERNAL_DECL_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \ + p6) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \ + p5##_type p5, p6##_type p6 +#define GMOCK_INTERNAL_DECL_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \ + p5##_type p5, p6##_type p6, p7##_type p7 +#define GMOCK_INTERNAL_DECL_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7, p8) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, \ + p4##_type p4, p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8 +#define GMOCK_INTERNAL_DECL_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7, p8, p9) p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, \ + p4##_type p4, p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8, \ + p9##_type p9 + +// The suffix of the class template implementing the action template. 
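+//
+// For instance (illustrative expansion),
+// GMOCK_INTERNAL_COUNT_AND_3_VALUE_PARAMS(a, b, c) below expands to P3, so
+// GMOCK_ACTION_CLASS_(Foo, AND_3_VALUE_PARAMS(a, b, c)) (defined just below)
+// names the generated class FooActionP3.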
+#define GMOCK_INTERNAL_COUNT_AND_0_VALUE_PARAMS() +#define GMOCK_INTERNAL_COUNT_AND_1_VALUE_PARAMS(p0) P +#define GMOCK_INTERNAL_COUNT_AND_2_VALUE_PARAMS(p0, p1) P2 +#define GMOCK_INTERNAL_COUNT_AND_3_VALUE_PARAMS(p0, p1, p2) P3 +#define GMOCK_INTERNAL_COUNT_AND_4_VALUE_PARAMS(p0, p1, p2, p3) P4 +#define GMOCK_INTERNAL_COUNT_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) P5 +#define GMOCK_INTERNAL_COUNT_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) P6 +#define GMOCK_INTERNAL_COUNT_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) P7 +#define GMOCK_INTERNAL_COUNT_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7) P8 +#define GMOCK_INTERNAL_COUNT_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7, p8) P9 +#define GMOCK_INTERNAL_COUNT_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \ + p7, p8, p9) P10 + +// The name of the class template implementing the action template. +#define GMOCK_ACTION_CLASS_(name, value_params)\ + GTEST_CONCAT_TOKEN_(name##Action, GMOCK_INTERNAL_COUNT_##value_params) + +#define ACTION_TEMPLATE(name, template_params, value_params)\ + template <GMOCK_INTERNAL_DECL_##template_params\ + GMOCK_INTERNAL_DECL_TYPE_##value_params>\ + class GMOCK_ACTION_CLASS_(name, value_params) {\ + public:\ + explicit GMOCK_ACTION_CLASS_(name, value_params)\ + GMOCK_INTERNAL_INIT_##value_params {}\ + template <typename F>\ + class gmock_Impl : public ::testing::ActionInterface<F> {\ + public:\ + typedef F function_type;\ + typedef typename ::testing::internal::Function<F>::Result return_type;\ + typedef typename ::testing::internal::Function<F>::ArgumentTuple\ + args_type;\ + explicit gmock_Impl GMOCK_INTERNAL_INIT_##value_params {}\ + virtual return_type Perform(const args_type& args) {\ + return ::testing::internal::ActionHelper<return_type, gmock_Impl>::\ + Perform(this, args);\ + }\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + return_type gmock_PerformImpl(const args_type& args, arg0_type arg0, \ + arg1_type arg1, arg2_type arg2, arg3_type arg3, arg4_type arg4, \ + arg5_type arg5, arg6_type arg6, arg7_type arg7, arg8_type arg8, \ + arg9_type arg9) const;\ + GMOCK_INTERNAL_DEFN_##value_params\ + private:\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename F> operator ::testing::Action<F>() const {\ + return ::testing::Action<F>(\ + new gmock_Impl<F>(GMOCK_INTERNAL_LIST_##value_params));\ + }\ + GMOCK_INTERNAL_DEFN_##value_params\ + private:\ + GTEST_DISALLOW_ASSIGN_(GMOCK_ACTION_CLASS_(name, value_params));\ + };\ + template <GMOCK_INTERNAL_DECL_##template_params\ + GMOCK_INTERNAL_DECL_TYPE_##value_params>\ + inline GMOCK_ACTION_CLASS_(name, value_params)<\ + GMOCK_INTERNAL_LIST_##template_params\ + GMOCK_INTERNAL_LIST_TYPE_##value_params> name(\ + GMOCK_INTERNAL_DECL_##value_params) {\ + return GMOCK_ACTION_CLASS_(name, value_params)<\ + GMOCK_INTERNAL_LIST_##template_params\ + GMOCK_INTERNAL_LIST_TYPE_##value_params>(\ + GMOCK_INTERNAL_LIST_##value_params);\ + }\ + template <GMOCK_INTERNAL_DECL_##template_params\ + GMOCK_INTERNAL_DECL_TYPE_##value_params>\ + template <typename F>\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + typename ::testing::internal::Function<F>::Result\ + GMOCK_ACTION_CLASS_(name, value_params)<\ + 
GMOCK_INTERNAL_LIST_##template_params\ + GMOCK_INTERNAL_LIST_TYPE_##value_params>::gmock_Impl<F>::\ + gmock_PerformImpl(\ + GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const + +#define ACTION(name)\ + class name##Action {\ + public:\ + name##Action() {}\ + template <typename F>\ + class gmock_Impl : public ::testing::ActionInterface<F> {\ + public:\ + typedef F function_type;\ + typedef typename ::testing::internal::Function<F>::Result return_type;\ + typedef typename ::testing::internal::Function<F>::ArgumentTuple\ + args_type;\ + gmock_Impl() {}\ + virtual return_type Perform(const args_type& args) {\ + return ::testing::internal::ActionHelper<return_type, gmock_Impl>::\ + Perform(this, args);\ + }\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + return_type gmock_PerformImpl(const args_type& args, arg0_type arg0, \ + arg1_type arg1, arg2_type arg2, arg3_type arg3, arg4_type arg4, \ + arg5_type arg5, arg6_type arg6, arg7_type arg7, arg8_type arg8, \ + arg9_type arg9) const;\ + private:\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename F> operator ::testing::Action<F>() const {\ + return ::testing::Action<F>(new gmock_Impl<F>());\ + }\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##Action);\ + };\ + inline name##Action name() {\ + return name##Action();\ + }\ + template <typename F>\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + typename ::testing::internal::Function<F>::Result\ + name##Action::gmock_Impl<F>::gmock_PerformImpl(\ + GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const + +#define ACTION_P(name, p0)\ + template <typename p0##_type>\ + class name##ActionP {\ + public:\ + explicit name##ActionP(p0##_type gmock_p0) : p0(gmock_p0) {}\ + template <typename F>\ + class gmock_Impl : public ::testing::ActionInterface<F> {\ + public:\ + typedef F function_type;\ + typedef typename ::testing::internal::Function<F>::Result return_type;\ + typedef typename ::testing::internal::Function<F>::ArgumentTuple\ + args_type;\ + explicit gmock_Impl(p0##_type gmock_p0) : p0(gmock_p0) {}\ + virtual return_type Perform(const args_type& args) {\ + return ::testing::internal::ActionHelper<return_type, gmock_Impl>::\ + Perform(this, args);\ + }\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + return_type gmock_PerformImpl(const args_type& args, arg0_type arg0, \ + arg1_type arg1, arg2_type arg2, arg3_type arg3, arg4_type arg4, \ + arg5_type arg5, arg6_type arg6, arg7_type arg7, arg8_type arg8, \ + arg9_type arg9) const;\ + p0##_type p0;\ + private:\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename F> operator ::testing::Action<F>() const {\ + return ::testing::Action<F>(new gmock_Impl<F>(p0));\ + }\ + p0##_type p0;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##ActionP);\ + };\ + template <typename p0##_type>\ + inline name##ActionP<p0##_type> name(p0##_type p0) {\ + return name##ActionP<p0##_type>(p0);\ + }\ + template <typename p0##_type>\ + template <typename F>\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename 
arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + typename ::testing::internal::Function<F>::Result\ + name##ActionP<p0##_type>::gmock_Impl<F>::gmock_PerformImpl(\ + GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const + +#define ACTION_P2(name, p0, p1)\ + template <typename p0##_type, typename p1##_type>\ + class name##ActionP2 {\ + public:\ + name##ActionP2(p0##_type gmock_p0, p1##_type gmock_p1) : p0(gmock_p0), \ + p1(gmock_p1) {}\ + template <typename F>\ + class gmock_Impl : public ::testing::ActionInterface<F> {\ + public:\ + typedef F function_type;\ + typedef typename ::testing::internal::Function<F>::Result return_type;\ + typedef typename ::testing::internal::Function<F>::ArgumentTuple\ + args_type;\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1) : p0(gmock_p0), \ + p1(gmock_p1) {}\ + virtual return_type Perform(const args_type& args) {\ + return ::testing::internal::ActionHelper<return_type, gmock_Impl>::\ + Perform(this, args);\ + }\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + return_type gmock_PerformImpl(const args_type& args, arg0_type arg0, \ + arg1_type arg1, arg2_type arg2, arg3_type arg3, arg4_type arg4, \ + arg5_type arg5, arg6_type arg6, arg7_type arg7, arg8_type arg8, \ + arg9_type arg9) const;\ + p0##_type p0;\ + p1##_type p1;\ + private:\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename F> operator ::testing::Action<F>() const {\ + return ::testing::Action<F>(new gmock_Impl<F>(p0, p1));\ + }\ + p0##_type p0;\ + p1##_type p1;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##ActionP2);\ + };\ + template <typename p0##_type, typename p1##_type>\ + inline name##ActionP2<p0##_type, p1##_type> name(p0##_type p0, \ + p1##_type p1) {\ + return name##ActionP2<p0##_type, p1##_type>(p0, p1);\ + }\ + template <typename p0##_type, typename p1##_type>\ + template <typename F>\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + typename ::testing::internal::Function<F>::Result\ + name##ActionP2<p0##_type, p1##_type>::gmock_Impl<F>::gmock_PerformImpl(\ + GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const + +#define ACTION_P3(name, p0, p1, p2)\ + template <typename p0##_type, typename p1##_type, typename p2##_type>\ + class name##ActionP3 {\ + public:\ + name##ActionP3(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2) {}\ + template <typename F>\ + class gmock_Impl : public ::testing::ActionInterface<F> {\ + public:\ + typedef F function_type;\ + typedef typename ::testing::internal::Function<F>::Result return_type;\ + typedef typename ::testing::internal::Function<F>::ArgumentTuple\ + args_type;\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2) {}\ + virtual return_type Perform(const args_type& args) {\ + return ::testing::internal::ActionHelper<return_type, gmock_Impl>::\ + Perform(this, args);\ + }\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + 
typename arg9_type>\ + return_type gmock_PerformImpl(const args_type& args, arg0_type arg0, \ + arg1_type arg1, arg2_type arg2, arg3_type arg3, arg4_type arg4, \ + arg5_type arg5, arg6_type arg6, arg7_type arg7, arg8_type arg8, \ + arg9_type arg9) const;\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + private:\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename F> operator ::testing::Action<F>() const {\ + return ::testing::Action<F>(new gmock_Impl<F>(p0, p1, p2));\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##ActionP3);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type>\ + inline name##ActionP3<p0##_type, p1##_type, p2##_type> name(p0##_type p0, \ + p1##_type p1, p2##_type p2) {\ + return name##ActionP3<p0##_type, p1##_type, p2##_type>(p0, p1, p2);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type>\ + template <typename F>\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + typename ::testing::internal::Function<F>::Result\ + name##ActionP3<p0##_type, p1##_type, \ + p2##_type>::gmock_Impl<F>::gmock_PerformImpl(\ + GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const + +#define ACTION_P4(name, p0, p1, p2, p3)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type>\ + class name##ActionP4 {\ + public:\ + name##ActionP4(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3) : p0(gmock_p0), p1(gmock_p1), \ + p2(gmock_p2), p3(gmock_p3) {}\ + template <typename F>\ + class gmock_Impl : public ::testing::ActionInterface<F> {\ + public:\ + typedef F function_type;\ + typedef typename ::testing::internal::Function<F>::Result return_type;\ + typedef typename ::testing::internal::Function<F>::ArgumentTuple\ + args_type;\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3) {}\ + virtual return_type Perform(const args_type& args) {\ + return ::testing::internal::ActionHelper<return_type, gmock_Impl>::\ + Perform(this, args);\ + }\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + return_type gmock_PerformImpl(const args_type& args, arg0_type arg0, \ + arg1_type arg1, arg2_type arg2, arg3_type arg3, arg4_type arg4, \ + arg5_type arg5, arg6_type arg6, arg7_type arg7, arg8_type arg8, \ + arg9_type arg9) const;\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + private:\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename F> operator ::testing::Action<F>() const {\ + return ::testing::Action<F>(new gmock_Impl<F>(p0, p1, p2, p3));\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##ActionP4);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type>\ + inline name##ActionP4<p0##_type, p1##_type, p2##_type, \ + p3##_type> name(p0##_type p0, p1##_type p1, p2##_type p2, \ + p3##_type p3) {\ + return name##ActionP4<p0##_type, p1##_type, p2##_type, p3##_type>(p0, p1, \ + p2, p3);\ + }\ + template <typename p0##_type, typename 
p1##_type, typename p2##_type, \ + typename p3##_type>\ + template <typename F>\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + typename ::testing::internal::Function<F>::Result\ + name##ActionP4<p0##_type, p1##_type, p2##_type, \ + p3##_type>::gmock_Impl<F>::gmock_PerformImpl(\ + GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const + +#define ACTION_P5(name, p0, p1, p2, p3, p4)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type>\ + class name##ActionP5 {\ + public:\ + name##ActionP5(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3, \ + p4##_type gmock_p4) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4) {}\ + template <typename F>\ + class gmock_Impl : public ::testing::ActionInterface<F> {\ + public:\ + typedef F function_type;\ + typedef typename ::testing::internal::Function<F>::Result return_type;\ + typedef typename ::testing::internal::Function<F>::ArgumentTuple\ + args_type;\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4) : p0(gmock_p0), \ + p1(gmock_p1), p2(gmock_p2), p3(gmock_p3), p4(gmock_p4) {}\ + virtual return_type Perform(const args_type& args) {\ + return ::testing::internal::ActionHelper<return_type, gmock_Impl>::\ + Perform(this, args);\ + }\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + return_type gmock_PerformImpl(const args_type& args, arg0_type arg0, \ + arg1_type arg1, arg2_type arg2, arg3_type arg3, arg4_type arg4, \ + arg5_type arg5, arg6_type arg6, arg7_type arg7, arg8_type arg8, \ + arg9_type arg9) const;\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + private:\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename F> operator ::testing::Action<F>() const {\ + return ::testing::Action<F>(new gmock_Impl<F>(p0, p1, p2, p3, p4));\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##ActionP5);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type>\ + inline name##ActionP5<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type> name(p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, \ + p4##_type p4) {\ + return name##ActionP5<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type>(p0, p1, p2, p3, p4);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type>\ + template <typename F>\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + typename ::testing::internal::Function<F>::Result\ + name##ActionP5<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type>::gmock_Impl<F>::gmock_PerformImpl(\ + GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const + +#define ACTION_P6(name, p0, p1, p2, p3, p4, p5)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename 
p3##_type, typename p4##_type, typename p5##_type>\ + class name##ActionP6 {\ + public:\ + name##ActionP6(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3, p4##_type gmock_p4, \ + p5##_type gmock_p5) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5) {}\ + template <typename F>\ + class gmock_Impl : public ::testing::ActionInterface<F> {\ + public:\ + typedef F function_type;\ + typedef typename ::testing::internal::Function<F>::Result return_type;\ + typedef typename ::testing::internal::Function<F>::ArgumentTuple\ + args_type;\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, \ + p5##_type gmock_p5) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5) {}\ + virtual return_type Perform(const args_type& args) {\ + return ::testing::internal::ActionHelper<return_type, gmock_Impl>::\ + Perform(this, args);\ + }\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + return_type gmock_PerformImpl(const args_type& args, arg0_type arg0, \ + arg1_type arg1, arg2_type arg2, arg3_type arg3, arg4_type arg4, \ + arg5_type arg5, arg6_type arg6, arg7_type arg7, arg8_type arg8, \ + arg9_type arg9) const;\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + private:\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename F> operator ::testing::Action<F>() const {\ + return ::testing::Action<F>(new gmock_Impl<F>(p0, p1, p2, p3, p4, p5));\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##ActionP6);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type>\ + inline name##ActionP6<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type> name(p0##_type p0, p1##_type p1, p2##_type p2, \ + p3##_type p3, p4##_type p4, p5##_type p5) {\ + return name##ActionP6<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type>(p0, p1, p2, p3, p4, p5);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type>\ + template <typename F>\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + typename ::testing::internal::Function<F>::Result\ + name##ActionP6<p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \ + p5##_type>::gmock_Impl<F>::gmock_PerformImpl(\ + GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const + +#define ACTION_P7(name, p0, p1, p2, p3, p4, p5, p6)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type>\ + class name##ActionP7 {\ + public:\ + name##ActionP7(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3, p4##_type gmock_p4, \ + p5##_type gmock_p5, p6##_type gmock_p6) : p0(gmock_p0), p1(gmock_p1), \ + p2(gmock_p2), p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), \ + p6(gmock_p6) {}\ + template <typename F>\ + class 
gmock_Impl : public ::testing::ActionInterface<F> {\ + public:\ + typedef F function_type;\ + typedef typename ::testing::internal::Function<F>::Result return_type;\ + typedef typename ::testing::internal::Function<F>::ArgumentTuple\ + args_type;\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ + p6##_type gmock_p6) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6) {}\ + virtual return_type Perform(const args_type& args) {\ + return ::testing::internal::ActionHelper<return_type, gmock_Impl>::\ + Perform(this, args);\ + }\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + return_type gmock_PerformImpl(const args_type& args, arg0_type arg0, \ + arg1_type arg1, arg2_type arg2, arg3_type arg3, arg4_type arg4, \ + arg5_type arg5, arg6_type arg6, arg7_type arg7, arg8_type arg8, \ + arg9_type arg9) const;\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + private:\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename F> operator ::testing::Action<F>() const {\ + return ::testing::Action<F>(new gmock_Impl<F>(p0, p1, p2, p3, p4, p5, \ + p6));\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##ActionP7);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type>\ + inline name##ActionP7<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type> name(p0##_type p0, p1##_type p1, \ + p2##_type p2, p3##_type p3, p4##_type p4, p5##_type p5, \ + p6##_type p6) {\ + return name##ActionP7<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type>(p0, p1, p2, p3, p4, p5, p6);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type>\ + template <typename F>\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + typename ::testing::internal::Function<F>::Result\ + name##ActionP7<p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \ + p5##_type, p6##_type>::gmock_Impl<F>::gmock_PerformImpl(\ + GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const + +#define ACTION_P8(name, p0, p1, p2, p3, p4, p5, p6, p7)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type>\ + class name##ActionP8 {\ + public:\ + name##ActionP8(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3, p4##_type gmock_p4, \ + p5##_type gmock_p5, p6##_type gmock_p6, \ + p7##_type gmock_p7) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), \ + p7(gmock_p7) {}\ + template <typename F>\ + class gmock_Impl : public ::testing::ActionInterface<F> {\ + public:\ + typedef F function_type;\ + typedef typename 
::testing::internal::Function<F>::Result return_type;\ + typedef typename ::testing::internal::Function<F>::ArgumentTuple\ + args_type;\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ + p6##_type gmock_p6, p7##_type gmock_p7) : p0(gmock_p0), \ + p1(gmock_p1), p2(gmock_p2), p3(gmock_p3), p4(gmock_p4), \ + p5(gmock_p5), p6(gmock_p6), p7(gmock_p7) {}\ + virtual return_type Perform(const args_type& args) {\ + return ::testing::internal::ActionHelper<return_type, gmock_Impl>::\ + Perform(this, args);\ + }\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + return_type gmock_PerformImpl(const args_type& args, arg0_type arg0, \ + arg1_type arg1, arg2_type arg2, arg3_type arg3, arg4_type arg4, \ + arg5_type arg5, arg6_type arg6, arg7_type arg7, arg8_type arg8, \ + arg9_type arg9) const;\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + p7##_type p7;\ + private:\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename F> operator ::testing::Action<F>() const {\ + return ::testing::Action<F>(new gmock_Impl<F>(p0, p1, p2, p3, p4, p5, \ + p6, p7));\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + p7##_type p7;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##ActionP8);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type>\ + inline name##ActionP8<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type> name(p0##_type p0, \ + p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, p5##_type p5, \ + p6##_type p6, p7##_type p7) {\ + return name##ActionP8<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type>(p0, p1, p2, p3, p4, p5, \ + p6, p7);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type>\ + template <typename F>\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + typename ::testing::internal::Function<F>::Result\ + name##ActionP8<p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \ + p5##_type, p6##_type, \ + p7##_type>::gmock_Impl<F>::gmock_PerformImpl(\ + GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const + +#define ACTION_P9(name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type>\ + class name##ActionP9 {\ + public:\ + name##ActionP9(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3, p4##_type gmock_p4, \ + p5##_type gmock_p5, p6##_type gmock_p6, p7##_type gmock_p7, \ + p8##_type gmock_p8) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), p7(gmock_p7), \ + p8(gmock_p8) {}\ + template <typename F>\ + 
class gmock_Impl : public ::testing::ActionInterface<F> {\ + public:\ + typedef F function_type;\ + typedef typename ::testing::internal::Function<F>::Result return_type;\ + typedef typename ::testing::internal::Function<F>::ArgumentTuple\ + args_type;\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ + p6##_type gmock_p6, p7##_type gmock_p7, \ + p8##_type gmock_p8) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), \ + p7(gmock_p7), p8(gmock_p8) {}\ + virtual return_type Perform(const args_type& args) {\ + return ::testing::internal::ActionHelper<return_type, gmock_Impl>::\ + Perform(this, args);\ + }\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + return_type gmock_PerformImpl(const args_type& args, arg0_type arg0, \ + arg1_type arg1, arg2_type arg2, arg3_type arg3, arg4_type arg4, \ + arg5_type arg5, arg6_type arg6, arg7_type arg7, arg8_type arg8, \ + arg9_type arg9) const;\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + p7##_type p7;\ + p8##_type p8;\ + private:\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename F> operator ::testing::Action<F>() const {\ + return ::testing::Action<F>(new gmock_Impl<F>(p0, p1, p2, p3, p4, p5, \ + p6, p7, p8));\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + p7##_type p7;\ + p8##_type p8;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##ActionP9);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type>\ + inline name##ActionP9<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type, \ + p8##_type> name(p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, \ + p4##_type p4, p5##_type p5, p6##_type p6, p7##_type p7, \ + p8##_type p8) {\ + return name##ActionP9<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type, p8##_type>(p0, p1, p2, \ + p3, p4, p5, p6, p7, p8);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type>\ + template <typename F>\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + typename ::testing::internal::Function<F>::Result\ + name##ActionP9<p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \ + p5##_type, p6##_type, p7##_type, \ + p8##_type>::gmock_Impl<F>::gmock_PerformImpl(\ + GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const + +#define ACTION_P10(name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type, \ + typename p9##_type>\ + class name##ActionP10 {\ + public:\ + name##ActionP10(p0##_type gmock_p0, p1##_type gmock_p1, \ + 
p2##_type gmock_p2, p3##_type gmock_p3, p4##_type gmock_p4, \ + p5##_type gmock_p5, p6##_type gmock_p6, p7##_type gmock_p7, \ + p8##_type gmock_p8, p9##_type gmock_p9) : p0(gmock_p0), p1(gmock_p1), \ + p2(gmock_p2), p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), \ + p7(gmock_p7), p8(gmock_p8), p9(gmock_p9) {}\ + template <typename F>\ + class gmock_Impl : public ::testing::ActionInterface<F> {\ + public:\ + typedef F function_type;\ + typedef typename ::testing::internal::Function<F>::Result return_type;\ + typedef typename ::testing::internal::Function<F>::ArgumentTuple\ + args_type;\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ + p6##_type gmock_p6, p7##_type gmock_p7, p8##_type gmock_p8, \ + p9##_type gmock_p9) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), \ + p7(gmock_p7), p8(gmock_p8), p9(gmock_p9) {}\ + virtual return_type Perform(const args_type& args) {\ + return ::testing::internal::ActionHelper<return_type, gmock_Impl>::\ + Perform(this, args);\ + }\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + return_type gmock_PerformImpl(const args_type& args, arg0_type arg0, \ + arg1_type arg1, arg2_type arg2, arg3_type arg3, arg4_type arg4, \ + arg5_type arg5, arg6_type arg6, arg7_type arg7, arg8_type arg8, \ + arg9_type arg9) const;\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + p7##_type p7;\ + p8##_type p8;\ + p9##_type p9;\ + private:\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename F> operator ::testing::Action<F>() const {\ + return ::testing::Action<F>(new gmock_Impl<F>(p0, p1, p2, p3, p4, p5, \ + p6, p7, p8, p9));\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + p7##_type p7;\ + p8##_type p8;\ + p9##_type p9;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##ActionP10);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type, \ + typename p9##_type>\ + inline name##ActionP10<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type, p8##_type, \ + p9##_type> name(p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, \ + p4##_type p4, p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8, \ + p9##_type p9) {\ + return name##ActionP10<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type, p8##_type, p9##_type>(p0, \ + p1, p2, p3, p4, p5, p6, p7, p8, p9);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type, \ + typename p9##_type>\ + template <typename F>\ + template <typename arg0_type, typename arg1_type, typename arg2_type, \ + typename arg3_type, typename arg4_type, typename arg5_type, \ + typename arg6_type, typename arg7_type, typename arg8_type, \ + typename arg9_type>\ + typename ::testing::internal::Function<F>::Result\ + name##ActionP10<p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \ + p5##_type, 
p6##_type, p7##_type, p8##_type, \ + p9##_type>::gmock_Impl<F>::gmock_PerformImpl(\ + GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const + +namespace testing { + + +// The ACTION*() macros trigger warning C4100 (unreferenced formal +// parameter) in MSVC with -W4. Unfortunately they cannot be fixed in +// the macro definition, as the warnings are generated when the macro +// is expanded and macro expansion cannot contain #pragma. Therefore +// we suppress them here. +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable:4100) +#endif + +// Various overloads for InvokeArgument<N>(). +// +// The InvokeArgument<N>(a1, a2, ..., a_k) action invokes the N-th +// (0-based) argument, which must be a k-ary callable, of the mock +// function, with arguments a1, a2, ..., a_k. +// +// Notes: +// +// 1. The arguments are passed by value by default. If you need to +// pass an argument by reference, wrap it inside ByRef(). For +// example, +// +// InvokeArgument<1>(5, string("Hello"), ByRef(foo)) +// +// passes 5 and string("Hello") by value, and passes foo by +// reference. +// +// 2. If the callable takes an argument by reference but ByRef() is +// not used, it will receive the reference to a copy of the value, +// instead of the original value. For example, when the 0-th +// argument of the mock function takes a const string&, the action +// +// InvokeArgument<0>(string("Hello")) +// +// makes a copy of the temporary string("Hello") object and passes a +// reference of the copy, instead of the original temporary object, +// to the callable. This makes it easy for a user to define an +// InvokeArgument action from temporary values and have it performed +// later. + +namespace internal { +namespace invoke_argument { + +// Appears in InvokeArgumentAdl's argument list to help avoid +// accidental calls to user functions of the same name. +struct AdlTag {}; + +// InvokeArgumentAdl - a helper for InvokeArgument. +// The basic overloads are provided here for generic functors. +// Overloads for other custom-callables are provided in the +// internal/custom/callback-actions.h header. 
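As a sketch of the InvokeArgument<N>() usage described in the notes above (MockFetcher, Fetch, and the done callback are hypothetical names for illustration, not part of this header):

    // Hypothetical mock whose second parameter is a 2-ary callback.
    class MockFetcher {
     public:
      MOCK_METHOD2(Fetch, bool(int id, void (*done)(int, const std::string&)));
    };

    // In a test body:
    using ::testing::_;
    using ::testing::ByRef;
    using ::testing::DoAll;
    using ::testing::InvokeArgument;
    using ::testing::Return;

    MockFetcher fetcher;
    std::string greeting("Hello");
    EXPECT_CALL(fetcher, Fetch(_, _))
        .WillOnce(DoAll(
            // Invokes the 1st (0-based) argument, i.e. done(5, greeting);
            // 5 is passed by value, greeting by reference via ByRef().
            InvokeArgument<1>(5, ByRef(greeting)),
            Return(true)));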
+ +template <typename R, typename F> +R InvokeArgumentAdl(AdlTag, F f) { + return f(); +} +template <typename R, typename F, typename A1> +R InvokeArgumentAdl(AdlTag, F f, A1 a1) { + return f(a1); +} +template <typename R, typename F, typename A1, typename A2> +R InvokeArgumentAdl(AdlTag, F f, A1 a1, A2 a2) { + return f(a1, a2); +} +template <typename R, typename F, typename A1, typename A2, typename A3> +R InvokeArgumentAdl(AdlTag, F f, A1 a1, A2 a2, A3 a3) { + return f(a1, a2, a3); +} +template <typename R, typename F, typename A1, typename A2, typename A3, + typename A4> +R InvokeArgumentAdl(AdlTag, F f, A1 a1, A2 a2, A3 a3, A4 a4) { + return f(a1, a2, a3, a4); +} +template <typename R, typename F, typename A1, typename A2, typename A3, + typename A4, typename A5> +R InvokeArgumentAdl(AdlTag, F f, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5) { + return f(a1, a2, a3, a4, a5); +} +template <typename R, typename F, typename A1, typename A2, typename A3, + typename A4, typename A5, typename A6> +R InvokeArgumentAdl(AdlTag, F f, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6) { + return f(a1, a2, a3, a4, a5, a6); +} +template <typename R, typename F, typename A1, typename A2, typename A3, + typename A4, typename A5, typename A6, typename A7> +R InvokeArgumentAdl(AdlTag, F f, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6, + A7 a7) { + return f(a1, a2, a3, a4, a5, a6, a7); +} +template <typename R, typename F, typename A1, typename A2, typename A3, + typename A4, typename A5, typename A6, typename A7, typename A8> +R InvokeArgumentAdl(AdlTag, F f, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6, + A7 a7, A8 a8) { + return f(a1, a2, a3, a4, a5, a6, a7, a8); +} +template <typename R, typename F, typename A1, typename A2, typename A3, + typename A4, typename A5, typename A6, typename A7, typename A8, + typename A9> +R InvokeArgumentAdl(AdlTag, F f, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6, + A7 a7, A8 a8, A9 a9) { + return f(a1, a2, a3, a4, a5, a6, a7, a8, a9); +} +template <typename R, typename F, typename A1, typename A2, typename A3, + typename A4, typename A5, typename A6, typename A7, typename A8, + typename A9, typename A10> +R InvokeArgumentAdl(AdlTag, F f, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6, + A7 a7, A8 a8, A9 a9, A10 a10) { + return f(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10); +} +} // namespace invoke_argument +} // namespace internal + +ACTION_TEMPLATE(InvokeArgument, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_0_VALUE_PARAMS()) { + using internal::invoke_argument::InvokeArgumentAdl; + return InvokeArgumentAdl<return_type>( + internal::invoke_argument::AdlTag(), + ::testing::get<k>(args)); +} + +ACTION_TEMPLATE(InvokeArgument, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_1_VALUE_PARAMS(p0)) { + using internal::invoke_argument::InvokeArgumentAdl; + return InvokeArgumentAdl<return_type>( + internal::invoke_argument::AdlTag(), + ::testing::get<k>(args), p0); +} + +ACTION_TEMPLATE(InvokeArgument, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_2_VALUE_PARAMS(p0, p1)) { + using internal::invoke_argument::InvokeArgumentAdl; + return InvokeArgumentAdl<return_type>( + internal::invoke_argument::AdlTag(), + ::testing::get<k>(args), p0, p1); +} + +ACTION_TEMPLATE(InvokeArgument, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_3_VALUE_PARAMS(p0, p1, p2)) { + using internal::invoke_argument::InvokeArgumentAdl; + return InvokeArgumentAdl<return_type>( + internal::invoke_argument::AdlTag(), + ::testing::get<k>(args), p0, p1, p2); +} + +ACTION_TEMPLATE(InvokeArgument, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_4_VALUE_PARAMS(p0, p1, p2, p3)) { + using 
internal::invoke_argument::InvokeArgumentAdl; + return InvokeArgumentAdl<return_type>( + internal::invoke_argument::AdlTag(), + ::testing::get<k>(args), p0, p1, p2, p3); +} + +ACTION_TEMPLATE(InvokeArgument, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4)) { + using internal::invoke_argument::InvokeArgumentAdl; + return InvokeArgumentAdl<return_type>( + internal::invoke_argument::AdlTag(), + ::testing::get<k>(args), p0, p1, p2, p3, p4); +} + +ACTION_TEMPLATE(InvokeArgument, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5)) { + using internal::invoke_argument::InvokeArgumentAdl; + return InvokeArgumentAdl<return_type>( + internal::invoke_argument::AdlTag(), + ::testing::get<k>(args), p0, p1, p2, p3, p4, p5); +} + +ACTION_TEMPLATE(InvokeArgument, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6)) { + using internal::invoke_argument::InvokeArgumentAdl; + return InvokeArgumentAdl<return_type>( + internal::invoke_argument::AdlTag(), + ::testing::get<k>(args), p0, p1, p2, p3, p4, p5, p6); +} + +ACTION_TEMPLATE(InvokeArgument, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7)) { + using internal::invoke_argument::InvokeArgumentAdl; + return InvokeArgumentAdl<return_type>( + internal::invoke_argument::AdlTag(), + ::testing::get<k>(args), p0, p1, p2, p3, p4, p5, p6, p7); +} + +ACTION_TEMPLATE(InvokeArgument, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, p8)) { + using internal::invoke_argument::InvokeArgumentAdl; + return InvokeArgumentAdl<return_type>( + internal::invoke_argument::AdlTag(), + ::testing::get<k>(args), p0, p1, p2, p3, p4, p5, p6, p7, p8); +} + +ACTION_TEMPLATE(InvokeArgument, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)) { + using internal::invoke_argument::InvokeArgumentAdl; + return InvokeArgumentAdl<return_type>( + internal::invoke_argument::AdlTag(), + ::testing::get<k>(args), p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); +} + +// Various overloads for ReturnNew<T>(). +// +// The ReturnNew<T>(a1, a2, ..., a_k) action returns a pointer to a new +// instance of type T, constructed on the heap with constructor arguments +// a1, a2, ..., and a_k. The caller assumes ownership of the returned value. 
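A minimal sketch of ReturnNew<T>() as described above (Widget and MockFactory are hypothetical names used only for illustration):

    // Hypothetical type constructed on the heap by the action.
    struct Widget {
      Widget(int id, const std::string& label) : id(id), label(label) {}
      int id;
      std::string label;
    };

    class MockFactory {
     public:
      MOCK_METHOD0(CreateWidget, Widget*());
    };

    // In a test body: every matching call heap-allocates a distinct
    // Widget(42, "w"); the test assumes ownership and must delete each
    // returned pointer.
    using ::testing::ReturnNew;

    MockFactory factory;
    EXPECT_CALL(factory, CreateWidget())
        .WillRepeatedly(ReturnNew<Widget>(42, std::string("w")));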
+ACTION_TEMPLATE(ReturnNew, + HAS_1_TEMPLATE_PARAMS(typename, T), + AND_0_VALUE_PARAMS()) { + return new T(); +} + +ACTION_TEMPLATE(ReturnNew, + HAS_1_TEMPLATE_PARAMS(typename, T), + AND_1_VALUE_PARAMS(p0)) { + return new T(p0); +} + +ACTION_TEMPLATE(ReturnNew, + HAS_1_TEMPLATE_PARAMS(typename, T), + AND_2_VALUE_PARAMS(p0, p1)) { + return new T(p0, p1); +} + +ACTION_TEMPLATE(ReturnNew, + HAS_1_TEMPLATE_PARAMS(typename, T), + AND_3_VALUE_PARAMS(p0, p1, p2)) { + return new T(p0, p1, p2); +} + +ACTION_TEMPLATE(ReturnNew, + HAS_1_TEMPLATE_PARAMS(typename, T), + AND_4_VALUE_PARAMS(p0, p1, p2, p3)) { + return new T(p0, p1, p2, p3); +} + +ACTION_TEMPLATE(ReturnNew, + HAS_1_TEMPLATE_PARAMS(typename, T), + AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4)) { + return new T(p0, p1, p2, p3, p4); +} + +ACTION_TEMPLATE(ReturnNew, + HAS_1_TEMPLATE_PARAMS(typename, T), + AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5)) { + return new T(p0, p1, p2, p3, p4, p5); +} + +ACTION_TEMPLATE(ReturnNew, + HAS_1_TEMPLATE_PARAMS(typename, T), + AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6)) { + return new T(p0, p1, p2, p3, p4, p5, p6); +} + +ACTION_TEMPLATE(ReturnNew, + HAS_1_TEMPLATE_PARAMS(typename, T), + AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7)) { + return new T(p0, p1, p2, p3, p4, p5, p6, p7); +} + +ACTION_TEMPLATE(ReturnNew, + HAS_1_TEMPLATE_PARAMS(typename, T), + AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, p8)) { + return new T(p0, p1, p2, p3, p4, p5, p6, p7, p8); +} + +ACTION_TEMPLATE(ReturnNew, + HAS_1_TEMPLATE_PARAMS(typename, T), + AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)) { + return new T(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); +} + +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +} // namespace testing + +// Include any custom actions added by the local installation. +// We must include this header at the end to make sure it can use the +// declarations from this file. +#include "gmock/internal/custom/gmock-generated-actions.h" + +#endif // GMOCK_INCLUDE_GMOCK_GMOCK_GENERATED_ACTIONS_H_ diff --git a/utils/unittest/googlemock/include/gmock/gmock-generated-function-mockers.h b/utils/unittest/googlemock/include/gmock/gmock-generated-function-mockers.h new file mode 100644 index 000000000000..4fa5ca94849f --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/gmock-generated-function-mockers.h @@ -0,0 +1,1095 @@ +// This file was GENERATED by command: +// pump.py gmock-generated-function-mockers.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file implements function mockers of various arities. + +#ifndef GMOCK_INCLUDE_GMOCK_GMOCK_GENERATED_FUNCTION_MOCKERS_H_ +#define GMOCK_INCLUDE_GMOCK_GMOCK_GENERATED_FUNCTION_MOCKERS_H_ + +#include "gmock/gmock-spec-builders.h" +#include "gmock/internal/gmock-internal-utils.h" + +#if GTEST_HAS_STD_FUNCTION_ +# include <functional> +#endif + +namespace testing { +namespace internal { + +template <typename F> +class FunctionMockerBase; + +// Note: class FunctionMocker really belongs to the ::testing +// namespace. However if we define it in ::testing, MSVC will +// complain when classes in ::testing::internal declare it as a +// friend class template. To workaround this compiler bug, we define +// FunctionMocker in ::testing::internal and import it into ::testing. +template <typename F> +class FunctionMocker; + +template <typename R> +class FunctionMocker<R()> : public + internal::FunctionMockerBase<R()> { + public: + typedef R F(); + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + MockSpec<F>& With() { + return this->current_spec(); + } + + R Invoke() { + // Even though gcc and MSVC don't enforce it, 'this->' is required + // by the C++ standard [14.6.4] here, as the base class type is + // dependent on the template argument (and thus shouldn't be + // looked into when resolving InvokeWith). + return this->InvokeWith(ArgumentTuple()); + } +}; + +template <typename R, typename A1> +class FunctionMocker<R(A1)> : public + internal::FunctionMockerBase<R(A1)> { + public: + typedef R F(A1); + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + MockSpec<F>& With(const Matcher<A1>& m1) { + this->current_spec().SetMatchers(::testing::make_tuple(m1)); + return this->current_spec(); + } + + R Invoke(A1 a1) { + // Even though gcc and MSVC don't enforce it, 'this->' is required + // by the C++ standard [14.6.4] here, as the base class type is + // dependent on the template argument (and thus shouldn't be + // looked into when resolving InvokeWith). + return this->InvokeWith(ArgumentTuple(a1)); + } +}; + +template <typename R, typename A1, typename A2> +class FunctionMocker<R(A1, A2)> : public + internal::FunctionMockerBase<R(A1, A2)> { + public: + typedef R F(A1, A2); + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + MockSpec<F>& With(const Matcher<A1>& m1, const Matcher<A2>& m2) { + this->current_spec().SetMatchers(::testing::make_tuple(m1, m2)); + return this->current_spec(); + } + + R Invoke(A1 a1, A2 a2) { + // Even though gcc and MSVC don't enforce it, 'this->' is required + // by the C++ standard [14.6.4] here, as the base class type is + // dependent on the template argument (and thus shouldn't be + // looked into when resolving InvokeWith). 
+ return this->InvokeWith(ArgumentTuple(a1, a2)); + } +}; + +template <typename R, typename A1, typename A2, typename A3> +class FunctionMocker<R(A1, A2, A3)> : public + internal::FunctionMockerBase<R(A1, A2, A3)> { + public: + typedef R F(A1, A2, A3); + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + MockSpec<F>& With(const Matcher<A1>& m1, const Matcher<A2>& m2, + const Matcher<A3>& m3) { + this->current_spec().SetMatchers(::testing::make_tuple(m1, m2, m3)); + return this->current_spec(); + } + + R Invoke(A1 a1, A2 a2, A3 a3) { + // Even though gcc and MSVC don't enforce it, 'this->' is required + // by the C++ standard [14.6.4] here, as the base class type is + // dependent on the template argument (and thus shouldn't be + // looked into when resolving InvokeWith). + return this->InvokeWith(ArgumentTuple(a1, a2, a3)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4> +class FunctionMocker<R(A1, A2, A3, A4)> : public + internal::FunctionMockerBase<R(A1, A2, A3, A4)> { + public: + typedef R F(A1, A2, A3, A4); + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + MockSpec<F>& With(const Matcher<A1>& m1, const Matcher<A2>& m2, + const Matcher<A3>& m3, const Matcher<A4>& m4) { + this->current_spec().SetMatchers(::testing::make_tuple(m1, m2, m3, m4)); + return this->current_spec(); + } + + R Invoke(A1 a1, A2 a2, A3 a3, A4 a4) { + // Even though gcc and MSVC don't enforce it, 'this->' is required + // by the C++ standard [14.6.4] here, as the base class type is + // dependent on the template argument (and thus shouldn't be + // looked into when resolving InvokeWith). + return this->InvokeWith(ArgumentTuple(a1, a2, a3, a4)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5> +class FunctionMocker<R(A1, A2, A3, A4, A5)> : public + internal::FunctionMockerBase<R(A1, A2, A3, A4, A5)> { + public: + typedef R F(A1, A2, A3, A4, A5); + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + MockSpec<F>& With(const Matcher<A1>& m1, const Matcher<A2>& m2, + const Matcher<A3>& m3, const Matcher<A4>& m4, const Matcher<A5>& m5) { + this->current_spec().SetMatchers(::testing::make_tuple(m1, m2, m3, m4, m5)); + return this->current_spec(); + } + + R Invoke(A1 a1, A2 a2, A3 a3, A4 a4, A5 a5) { + // Even though gcc and MSVC don't enforce it, 'this->' is required + // by the C++ standard [14.6.4] here, as the base class type is + // dependent on the template argument (and thus shouldn't be + // looked into when resolving InvokeWith). 
+ return this->InvokeWith(ArgumentTuple(a1, a2, a3, a4, a5)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6> +class FunctionMocker<R(A1, A2, A3, A4, A5, A6)> : public + internal::FunctionMockerBase<R(A1, A2, A3, A4, A5, A6)> { + public: + typedef R F(A1, A2, A3, A4, A5, A6); + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + MockSpec<F>& With(const Matcher<A1>& m1, const Matcher<A2>& m2, + const Matcher<A3>& m3, const Matcher<A4>& m4, const Matcher<A5>& m5, + const Matcher<A6>& m6) { + this->current_spec().SetMatchers(::testing::make_tuple(m1, m2, m3, m4, m5, + m6)); + return this->current_spec(); + } + + R Invoke(A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6) { + // Even though gcc and MSVC don't enforce it, 'this->' is required + // by the C++ standard [14.6.4] here, as the base class type is + // dependent on the template argument (and thus shouldn't be + // looked into when resolving InvokeWith). + return this->InvokeWith(ArgumentTuple(a1, a2, a3, a4, a5, a6)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7> +class FunctionMocker<R(A1, A2, A3, A4, A5, A6, A7)> : public + internal::FunctionMockerBase<R(A1, A2, A3, A4, A5, A6, A7)> { + public: + typedef R F(A1, A2, A3, A4, A5, A6, A7); + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + MockSpec<F>& With(const Matcher<A1>& m1, const Matcher<A2>& m2, + const Matcher<A3>& m3, const Matcher<A4>& m4, const Matcher<A5>& m5, + const Matcher<A6>& m6, const Matcher<A7>& m7) { + this->current_spec().SetMatchers(::testing::make_tuple(m1, m2, m3, m4, m5, + m6, m7)); + return this->current_spec(); + } + + R Invoke(A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6, A7 a7) { + // Even though gcc and MSVC don't enforce it, 'this->' is required + // by the C++ standard [14.6.4] here, as the base class type is + // dependent on the template argument (and thus shouldn't be + // looked into when resolving InvokeWith). + return this->InvokeWith(ArgumentTuple(a1, a2, a3, a4, a5, a6, a7)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7, typename A8> +class FunctionMocker<R(A1, A2, A3, A4, A5, A6, A7, A8)> : public + internal::FunctionMockerBase<R(A1, A2, A3, A4, A5, A6, A7, A8)> { + public: + typedef R F(A1, A2, A3, A4, A5, A6, A7, A8); + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + MockSpec<F>& With(const Matcher<A1>& m1, const Matcher<A2>& m2, + const Matcher<A3>& m3, const Matcher<A4>& m4, const Matcher<A5>& m5, + const Matcher<A6>& m6, const Matcher<A7>& m7, const Matcher<A8>& m8) { + this->current_spec().SetMatchers(::testing::make_tuple(m1, m2, m3, m4, m5, + m6, m7, m8)); + return this->current_spec(); + } + + R Invoke(A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6, A7 a7, A8 a8) { + // Even though gcc and MSVC don't enforce it, 'this->' is required + // by the C++ standard [14.6.4] here, as the base class type is + // dependent on the template argument (and thus shouldn't be + // looked into when resolving InvokeWith). 
+ return this->InvokeWith(ArgumentTuple(a1, a2, a3, a4, a5, a6, a7, a8)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7, typename A8, typename A9> +class FunctionMocker<R(A1, A2, A3, A4, A5, A6, A7, A8, A9)> : public + internal::FunctionMockerBase<R(A1, A2, A3, A4, A5, A6, A7, A8, A9)> { + public: + typedef R F(A1, A2, A3, A4, A5, A6, A7, A8, A9); + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + MockSpec<F>& With(const Matcher<A1>& m1, const Matcher<A2>& m2, + const Matcher<A3>& m3, const Matcher<A4>& m4, const Matcher<A5>& m5, + const Matcher<A6>& m6, const Matcher<A7>& m7, const Matcher<A8>& m8, + const Matcher<A9>& m9) { + this->current_spec().SetMatchers(::testing::make_tuple(m1, m2, m3, m4, m5, + m6, m7, m8, m9)); + return this->current_spec(); + } + + R Invoke(A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6, A7 a7, A8 a8, A9 a9) { + // Even though gcc and MSVC don't enforce it, 'this->' is required + // by the C++ standard [14.6.4] here, as the base class type is + // dependent on the template argument (and thus shouldn't be + // looked into when resolving InvokeWith). + return this->InvokeWith(ArgumentTuple(a1, a2, a3, a4, a5, a6, a7, a8, a9)); + } +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7, typename A8, typename A9, + typename A10> +class FunctionMocker<R(A1, A2, A3, A4, A5, A6, A7, A8, A9, A10)> : public + internal::FunctionMockerBase<R(A1, A2, A3, A4, A5, A6, A7, A8, A9, A10)> { + public: + typedef R F(A1, A2, A3, A4, A5, A6, A7, A8, A9, A10); + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + + MockSpec<F>& With(const Matcher<A1>& m1, const Matcher<A2>& m2, + const Matcher<A3>& m3, const Matcher<A4>& m4, const Matcher<A5>& m5, + const Matcher<A6>& m6, const Matcher<A7>& m7, const Matcher<A8>& m8, + const Matcher<A9>& m9, const Matcher<A10>& m10) { + this->current_spec().SetMatchers(::testing::make_tuple(m1, m2, m3, m4, m5, + m6, m7, m8, m9, m10)); + return this->current_spec(); + } + + R Invoke(A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6, A7 a7, A8 a8, A9 a9, + A10 a10) { + // Even though gcc and MSVC don't enforce it, 'this->' is required + // by the C++ standard [14.6.4] here, as the base class type is + // dependent on the template argument (and thus shouldn't be + // looked into when resolving InvokeWith). + return this->InvokeWith(ArgumentTuple(a1, a2, a3, a4, a5, a6, a7, a8, a9, + a10)); + } +}; + +} // namespace internal + +// The style guide prohibits "using" statements in a namespace scope +// inside a header file. However, the FunctionMocker class template +// is meant to be defined in the ::testing namespace. The following +// line is just a trick for working around a bug in MSVC 8.0, which +// cannot handle it if we define FunctionMocker in ::testing. +using internal::FunctionMocker; + +// GMOCK_RESULT_(tn, F) expands to the result type of function type F. +// We define this as a variadic macro in case F contains unprotected +// commas (the same reason that we use variadic macros in other places +// in this file). +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_RESULT_(tn, ...) \ + tn ::testing::internal::Function<__VA_ARGS__>::Result + +// The type of argument N of the given function type. +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_ARG_(tn, N, ...) 
\ + tn ::testing::internal::Function<__VA_ARGS__>::Argument##N + +// The matcher type for argument N of the given function type. +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_MATCHER_(tn, N, ...) \ + const ::testing::Matcher<GMOCK_ARG_(tn, N, __VA_ARGS__)>& + +// The variable for mocking the given method. +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_MOCKER_(arity, constness, Method) \ + GTEST_CONCAT_TOKEN_(gmock##constness##arity##_##Method##_, __LINE__) + +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_METHOD0_(tn, constness, ct, Method, ...) \ + GMOCK_RESULT_(tn, __VA_ARGS__) ct Method( \ + ) constness { \ + GTEST_COMPILE_ASSERT_((::testing::tuple_size< \ + tn ::testing::internal::Function<__VA_ARGS__>::ArgumentTuple>::value \ + == 0), \ + this_method_does_not_take_0_arguments); \ + GMOCK_MOCKER_(0, constness, Method).SetOwnerAndName(this, #Method); \ + return GMOCK_MOCKER_(0, constness, Method).Invoke(); \ + } \ + ::testing::MockSpec<__VA_ARGS__>& \ + gmock_##Method() constness { \ + GMOCK_MOCKER_(0, constness, Method).RegisterOwner(this); \ + return GMOCK_MOCKER_(0, constness, Method).With(); \ + } \ + mutable ::testing::FunctionMocker<__VA_ARGS__> GMOCK_MOCKER_(0, constness, \ + Method) + +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_METHOD1_(tn, constness, ct, Method, ...) \ + GMOCK_RESULT_(tn, __VA_ARGS__) ct Method( \ + GMOCK_ARG_(tn, 1, __VA_ARGS__) gmock_a1) constness { \ + GTEST_COMPILE_ASSERT_((::testing::tuple_size< \ + tn ::testing::internal::Function<__VA_ARGS__>::ArgumentTuple>::value \ + == 1), \ + this_method_does_not_take_1_argument); \ + GMOCK_MOCKER_(1, constness, Method).SetOwnerAndName(this, #Method); \ + return GMOCK_MOCKER_(1, constness, Method).Invoke(gmock_a1); \ + } \ + ::testing::MockSpec<__VA_ARGS__>& \ + gmock_##Method(GMOCK_MATCHER_(tn, 1, __VA_ARGS__) gmock_a1) constness { \ + GMOCK_MOCKER_(1, constness, Method).RegisterOwner(this); \ + return GMOCK_MOCKER_(1, constness, Method).With(gmock_a1); \ + } \ + mutable ::testing::FunctionMocker<__VA_ARGS__> GMOCK_MOCKER_(1, constness, \ + Method) + +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_METHOD2_(tn, constness, ct, Method, ...) \ + GMOCK_RESULT_(tn, __VA_ARGS__) ct Method( \ + GMOCK_ARG_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_ARG_(tn, 2, __VA_ARGS__) gmock_a2) constness { \ + GTEST_COMPILE_ASSERT_((::testing::tuple_size< \ + tn ::testing::internal::Function<__VA_ARGS__>::ArgumentTuple>::value \ + == 2), \ + this_method_does_not_take_2_arguments); \ + GMOCK_MOCKER_(2, constness, Method).SetOwnerAndName(this, #Method); \ + return GMOCK_MOCKER_(2, constness, Method).Invoke(gmock_a1, gmock_a2); \ + } \ + ::testing::MockSpec<__VA_ARGS__>& \ + gmock_##Method(GMOCK_MATCHER_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_MATCHER_(tn, 2, __VA_ARGS__) gmock_a2) constness { \ + GMOCK_MOCKER_(2, constness, Method).RegisterOwner(this); \ + return GMOCK_MOCKER_(2, constness, Method).With(gmock_a1, gmock_a2); \ + } \ + mutable ::testing::FunctionMocker<__VA_ARGS__> GMOCK_MOCKER_(2, constness, \ + Method) + +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_METHOD3_(tn, constness, ct, Method, ...) 
\ + GMOCK_RESULT_(tn, __VA_ARGS__) ct Method( \ + GMOCK_ARG_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_ARG_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_ARG_(tn, 3, __VA_ARGS__) gmock_a3) constness { \ + GTEST_COMPILE_ASSERT_((::testing::tuple_size< \ + tn ::testing::internal::Function<__VA_ARGS__>::ArgumentTuple>::value \ + == 3), \ + this_method_does_not_take_3_arguments); \ + GMOCK_MOCKER_(3, constness, Method).SetOwnerAndName(this, #Method); \ + return GMOCK_MOCKER_(3, constness, Method).Invoke(gmock_a1, gmock_a2, \ + gmock_a3); \ + } \ + ::testing::MockSpec<__VA_ARGS__>& \ + gmock_##Method(GMOCK_MATCHER_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_MATCHER_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_MATCHER_(tn, 3, __VA_ARGS__) gmock_a3) constness { \ + GMOCK_MOCKER_(3, constness, Method).RegisterOwner(this); \ + return GMOCK_MOCKER_(3, constness, Method).With(gmock_a1, gmock_a2, \ + gmock_a3); \ + } \ + mutable ::testing::FunctionMocker<__VA_ARGS__> GMOCK_MOCKER_(3, constness, \ + Method) + +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_METHOD4_(tn, constness, ct, Method, ...) \ + GMOCK_RESULT_(tn, __VA_ARGS__) ct Method( \ + GMOCK_ARG_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_ARG_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_ARG_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_ARG_(tn, 4, __VA_ARGS__) gmock_a4) constness { \ + GTEST_COMPILE_ASSERT_((::testing::tuple_size< \ + tn ::testing::internal::Function<__VA_ARGS__>::ArgumentTuple>::value \ + == 4), \ + this_method_does_not_take_4_arguments); \ + GMOCK_MOCKER_(4, constness, Method).SetOwnerAndName(this, #Method); \ + return GMOCK_MOCKER_(4, constness, Method).Invoke(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4); \ + } \ + ::testing::MockSpec<__VA_ARGS__>& \ + gmock_##Method(GMOCK_MATCHER_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_MATCHER_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_MATCHER_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_MATCHER_(tn, 4, __VA_ARGS__) gmock_a4) constness { \ + GMOCK_MOCKER_(4, constness, Method).RegisterOwner(this); \ + return GMOCK_MOCKER_(4, constness, Method).With(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4); \ + } \ + mutable ::testing::FunctionMocker<__VA_ARGS__> GMOCK_MOCKER_(4, constness, \ + Method) + +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_METHOD5_(tn, constness, ct, Method, ...) 
\ + GMOCK_RESULT_(tn, __VA_ARGS__) ct Method( \ + GMOCK_ARG_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_ARG_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_ARG_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_ARG_(tn, 4, __VA_ARGS__) gmock_a4, \ + GMOCK_ARG_(tn, 5, __VA_ARGS__) gmock_a5) constness { \ + GTEST_COMPILE_ASSERT_((::testing::tuple_size< \ + tn ::testing::internal::Function<__VA_ARGS__>::ArgumentTuple>::value \ + == 5), \ + this_method_does_not_take_5_arguments); \ + GMOCK_MOCKER_(5, constness, Method).SetOwnerAndName(this, #Method); \ + return GMOCK_MOCKER_(5, constness, Method).Invoke(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4, gmock_a5); \ + } \ + ::testing::MockSpec<__VA_ARGS__>& \ + gmock_##Method(GMOCK_MATCHER_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_MATCHER_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_MATCHER_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_MATCHER_(tn, 4, __VA_ARGS__) gmock_a4, \ + GMOCK_MATCHER_(tn, 5, __VA_ARGS__) gmock_a5) constness { \ + GMOCK_MOCKER_(5, constness, Method).RegisterOwner(this); \ + return GMOCK_MOCKER_(5, constness, Method).With(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4, gmock_a5); \ + } \ + mutable ::testing::FunctionMocker<__VA_ARGS__> GMOCK_MOCKER_(5, constness, \ + Method) + +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_METHOD6_(tn, constness, ct, Method, ...) \ + GMOCK_RESULT_(tn, __VA_ARGS__) ct Method( \ + GMOCK_ARG_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_ARG_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_ARG_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_ARG_(tn, 4, __VA_ARGS__) gmock_a4, \ + GMOCK_ARG_(tn, 5, __VA_ARGS__) gmock_a5, \ + GMOCK_ARG_(tn, 6, __VA_ARGS__) gmock_a6) constness { \ + GTEST_COMPILE_ASSERT_((::testing::tuple_size< \ + tn ::testing::internal::Function<__VA_ARGS__>::ArgumentTuple>::value \ + == 6), \ + this_method_does_not_take_6_arguments); \ + GMOCK_MOCKER_(6, constness, Method).SetOwnerAndName(this, #Method); \ + return GMOCK_MOCKER_(6, constness, Method).Invoke(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4, gmock_a5, gmock_a6); \ + } \ + ::testing::MockSpec<__VA_ARGS__>& \ + gmock_##Method(GMOCK_MATCHER_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_MATCHER_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_MATCHER_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_MATCHER_(tn, 4, __VA_ARGS__) gmock_a4, \ + GMOCK_MATCHER_(tn, 5, __VA_ARGS__) gmock_a5, \ + GMOCK_MATCHER_(tn, 6, __VA_ARGS__) gmock_a6) constness { \ + GMOCK_MOCKER_(6, constness, Method).RegisterOwner(this); \ + return GMOCK_MOCKER_(6, constness, Method).With(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4, gmock_a5, gmock_a6); \ + } \ + mutable ::testing::FunctionMocker<__VA_ARGS__> GMOCK_MOCKER_(6, constness, \ + Method) + +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_METHOD7_(tn, constness, ct, Method, ...) 
\ + GMOCK_RESULT_(tn, __VA_ARGS__) ct Method( \ + GMOCK_ARG_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_ARG_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_ARG_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_ARG_(tn, 4, __VA_ARGS__) gmock_a4, \ + GMOCK_ARG_(tn, 5, __VA_ARGS__) gmock_a5, \ + GMOCK_ARG_(tn, 6, __VA_ARGS__) gmock_a6, \ + GMOCK_ARG_(tn, 7, __VA_ARGS__) gmock_a7) constness { \ + GTEST_COMPILE_ASSERT_((::testing::tuple_size< \ + tn ::testing::internal::Function<__VA_ARGS__>::ArgumentTuple>::value \ + == 7), \ + this_method_does_not_take_7_arguments); \ + GMOCK_MOCKER_(7, constness, Method).SetOwnerAndName(this, #Method); \ + return GMOCK_MOCKER_(7, constness, Method).Invoke(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4, gmock_a5, gmock_a6, gmock_a7); \ + } \ + ::testing::MockSpec<__VA_ARGS__>& \ + gmock_##Method(GMOCK_MATCHER_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_MATCHER_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_MATCHER_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_MATCHER_(tn, 4, __VA_ARGS__) gmock_a4, \ + GMOCK_MATCHER_(tn, 5, __VA_ARGS__) gmock_a5, \ + GMOCK_MATCHER_(tn, 6, __VA_ARGS__) gmock_a6, \ + GMOCK_MATCHER_(tn, 7, __VA_ARGS__) gmock_a7) constness { \ + GMOCK_MOCKER_(7, constness, Method).RegisterOwner(this); \ + return GMOCK_MOCKER_(7, constness, Method).With(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4, gmock_a5, gmock_a6, gmock_a7); \ + } \ + mutable ::testing::FunctionMocker<__VA_ARGS__> GMOCK_MOCKER_(7, constness, \ + Method) + +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_METHOD8_(tn, constness, ct, Method, ...) \ + GMOCK_RESULT_(tn, __VA_ARGS__) ct Method( \ + GMOCK_ARG_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_ARG_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_ARG_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_ARG_(tn, 4, __VA_ARGS__) gmock_a4, \ + GMOCK_ARG_(tn, 5, __VA_ARGS__) gmock_a5, \ + GMOCK_ARG_(tn, 6, __VA_ARGS__) gmock_a6, \ + GMOCK_ARG_(tn, 7, __VA_ARGS__) gmock_a7, \ + GMOCK_ARG_(tn, 8, __VA_ARGS__) gmock_a8) constness { \ + GTEST_COMPILE_ASSERT_((::testing::tuple_size< \ + tn ::testing::internal::Function<__VA_ARGS__>::ArgumentTuple>::value \ + == 8), \ + this_method_does_not_take_8_arguments); \ + GMOCK_MOCKER_(8, constness, Method).SetOwnerAndName(this, #Method); \ + return GMOCK_MOCKER_(8, constness, Method).Invoke(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4, gmock_a5, gmock_a6, gmock_a7, gmock_a8); \ + } \ + ::testing::MockSpec<__VA_ARGS__>& \ + gmock_##Method(GMOCK_MATCHER_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_MATCHER_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_MATCHER_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_MATCHER_(tn, 4, __VA_ARGS__) gmock_a4, \ + GMOCK_MATCHER_(tn, 5, __VA_ARGS__) gmock_a5, \ + GMOCK_MATCHER_(tn, 6, __VA_ARGS__) gmock_a6, \ + GMOCK_MATCHER_(tn, 7, __VA_ARGS__) gmock_a7, \ + GMOCK_MATCHER_(tn, 8, __VA_ARGS__) gmock_a8) constness { \ + GMOCK_MOCKER_(8, constness, Method).RegisterOwner(this); \ + return GMOCK_MOCKER_(8, constness, Method).With(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4, gmock_a5, gmock_a6, gmock_a7, gmock_a8); \ + } \ + mutable ::testing::FunctionMocker<__VA_ARGS__> GMOCK_MOCKER_(8, constness, \ + Method) + +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_METHOD9_(tn, constness, ct, Method, ...) 
\ + GMOCK_RESULT_(tn, __VA_ARGS__) ct Method( \ + GMOCK_ARG_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_ARG_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_ARG_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_ARG_(tn, 4, __VA_ARGS__) gmock_a4, \ + GMOCK_ARG_(tn, 5, __VA_ARGS__) gmock_a5, \ + GMOCK_ARG_(tn, 6, __VA_ARGS__) gmock_a6, \ + GMOCK_ARG_(tn, 7, __VA_ARGS__) gmock_a7, \ + GMOCK_ARG_(tn, 8, __VA_ARGS__) gmock_a8, \ + GMOCK_ARG_(tn, 9, __VA_ARGS__) gmock_a9) constness { \ + GTEST_COMPILE_ASSERT_((::testing::tuple_size< \ + tn ::testing::internal::Function<__VA_ARGS__>::ArgumentTuple>::value \ + == 9), \ + this_method_does_not_take_9_arguments); \ + GMOCK_MOCKER_(9, constness, Method).SetOwnerAndName(this, #Method); \ + return GMOCK_MOCKER_(9, constness, Method).Invoke(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4, gmock_a5, gmock_a6, gmock_a7, gmock_a8, \ + gmock_a9); \ + } \ + ::testing::MockSpec<__VA_ARGS__>& \ + gmock_##Method(GMOCK_MATCHER_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_MATCHER_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_MATCHER_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_MATCHER_(tn, 4, __VA_ARGS__) gmock_a4, \ + GMOCK_MATCHER_(tn, 5, __VA_ARGS__) gmock_a5, \ + GMOCK_MATCHER_(tn, 6, __VA_ARGS__) gmock_a6, \ + GMOCK_MATCHER_(tn, 7, __VA_ARGS__) gmock_a7, \ + GMOCK_MATCHER_(tn, 8, __VA_ARGS__) gmock_a8, \ + GMOCK_MATCHER_(tn, 9, __VA_ARGS__) gmock_a9) constness { \ + GMOCK_MOCKER_(9, constness, Method).RegisterOwner(this); \ + return GMOCK_MOCKER_(9, constness, Method).With(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4, gmock_a5, gmock_a6, gmock_a7, gmock_a8, \ + gmock_a9); \ + } \ + mutable ::testing::FunctionMocker<__VA_ARGS__> GMOCK_MOCKER_(9, constness, \ + Method) + +// INTERNAL IMPLEMENTATION - DON'T USE IN USER CODE!!! +#define GMOCK_METHOD10_(tn, constness, ct, Method, ...) 
\ + GMOCK_RESULT_(tn, __VA_ARGS__) ct Method( \ + GMOCK_ARG_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_ARG_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_ARG_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_ARG_(tn, 4, __VA_ARGS__) gmock_a4, \ + GMOCK_ARG_(tn, 5, __VA_ARGS__) gmock_a5, \ + GMOCK_ARG_(tn, 6, __VA_ARGS__) gmock_a6, \ + GMOCK_ARG_(tn, 7, __VA_ARGS__) gmock_a7, \ + GMOCK_ARG_(tn, 8, __VA_ARGS__) gmock_a8, \ + GMOCK_ARG_(tn, 9, __VA_ARGS__) gmock_a9, \ + GMOCK_ARG_(tn, 10, __VA_ARGS__) gmock_a10) constness { \ + GTEST_COMPILE_ASSERT_((::testing::tuple_size< \ + tn ::testing::internal::Function<__VA_ARGS__>::ArgumentTuple>::value \ + == 10), \ + this_method_does_not_take_10_arguments); \ + GMOCK_MOCKER_(10, constness, Method).SetOwnerAndName(this, #Method); \ + return GMOCK_MOCKER_(10, constness, Method).Invoke(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4, gmock_a5, gmock_a6, gmock_a7, gmock_a8, gmock_a9, \ + gmock_a10); \ + } \ + ::testing::MockSpec<__VA_ARGS__>& \ + gmock_##Method(GMOCK_MATCHER_(tn, 1, __VA_ARGS__) gmock_a1, \ + GMOCK_MATCHER_(tn, 2, __VA_ARGS__) gmock_a2, \ + GMOCK_MATCHER_(tn, 3, __VA_ARGS__) gmock_a3, \ + GMOCK_MATCHER_(tn, 4, __VA_ARGS__) gmock_a4, \ + GMOCK_MATCHER_(tn, 5, __VA_ARGS__) gmock_a5, \ + GMOCK_MATCHER_(tn, 6, __VA_ARGS__) gmock_a6, \ + GMOCK_MATCHER_(tn, 7, __VA_ARGS__) gmock_a7, \ + GMOCK_MATCHER_(tn, 8, __VA_ARGS__) gmock_a8, \ + GMOCK_MATCHER_(tn, 9, __VA_ARGS__) gmock_a9, \ + GMOCK_MATCHER_(tn, 10, \ + __VA_ARGS__) gmock_a10) constness { \ + GMOCK_MOCKER_(10, constness, Method).RegisterOwner(this); \ + return GMOCK_MOCKER_(10, constness, Method).With(gmock_a1, gmock_a2, \ + gmock_a3, gmock_a4, gmock_a5, gmock_a6, gmock_a7, gmock_a8, gmock_a9, \ + gmock_a10); \ + } \ + mutable ::testing::FunctionMocker<__VA_ARGS__> GMOCK_MOCKER_(10, constness, \ + Method) + +#define MOCK_METHOD0(m, ...) GMOCK_METHOD0_(, , , m, __VA_ARGS__) +#define MOCK_METHOD1(m, ...) GMOCK_METHOD1_(, , , m, __VA_ARGS__) +#define MOCK_METHOD2(m, ...) GMOCK_METHOD2_(, , , m, __VA_ARGS__) +#define MOCK_METHOD3(m, ...) GMOCK_METHOD3_(, , , m, __VA_ARGS__) +#define MOCK_METHOD4(m, ...) GMOCK_METHOD4_(, , , m, __VA_ARGS__) +#define MOCK_METHOD5(m, ...) GMOCK_METHOD5_(, , , m, __VA_ARGS__) +#define MOCK_METHOD6(m, ...) GMOCK_METHOD6_(, , , m, __VA_ARGS__) +#define MOCK_METHOD7(m, ...) GMOCK_METHOD7_(, , , m, __VA_ARGS__) +#define MOCK_METHOD8(m, ...) GMOCK_METHOD8_(, , , m, __VA_ARGS__) +#define MOCK_METHOD9(m, ...) GMOCK_METHOD9_(, , , m, __VA_ARGS__) +#define MOCK_METHOD10(m, ...) GMOCK_METHOD10_(, , , m, __VA_ARGS__) + +#define MOCK_CONST_METHOD0(m, ...) GMOCK_METHOD0_(, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD1(m, ...) GMOCK_METHOD1_(, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD2(m, ...) GMOCK_METHOD2_(, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD3(m, ...) GMOCK_METHOD3_(, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD4(m, ...) GMOCK_METHOD4_(, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD5(m, ...) GMOCK_METHOD5_(, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD6(m, ...) GMOCK_METHOD6_(, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD7(m, ...) GMOCK_METHOD7_(, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD8(m, ...) GMOCK_METHOD8_(, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD9(m, ...) GMOCK_METHOD9_(, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD10(m, ...) GMOCK_METHOD10_(, const, , m, __VA_ARGS__) + +#define MOCK_METHOD0_T(m, ...) GMOCK_METHOD0_(typename, , , m, __VA_ARGS__) +#define MOCK_METHOD1_T(m, ...) 
GMOCK_METHOD1_(typename, , , m, __VA_ARGS__) +#define MOCK_METHOD2_T(m, ...) GMOCK_METHOD2_(typename, , , m, __VA_ARGS__) +#define MOCK_METHOD3_T(m, ...) GMOCK_METHOD3_(typename, , , m, __VA_ARGS__) +#define MOCK_METHOD4_T(m, ...) GMOCK_METHOD4_(typename, , , m, __VA_ARGS__) +#define MOCK_METHOD5_T(m, ...) GMOCK_METHOD5_(typename, , , m, __VA_ARGS__) +#define MOCK_METHOD6_T(m, ...) GMOCK_METHOD6_(typename, , , m, __VA_ARGS__) +#define MOCK_METHOD7_T(m, ...) GMOCK_METHOD7_(typename, , , m, __VA_ARGS__) +#define MOCK_METHOD8_T(m, ...) GMOCK_METHOD8_(typename, , , m, __VA_ARGS__) +#define MOCK_METHOD9_T(m, ...) GMOCK_METHOD9_(typename, , , m, __VA_ARGS__) +#define MOCK_METHOD10_T(m, ...) GMOCK_METHOD10_(typename, , , m, __VA_ARGS__) + +#define MOCK_CONST_METHOD0_T(m, ...) \ + GMOCK_METHOD0_(typename, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD1_T(m, ...) \ + GMOCK_METHOD1_(typename, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD2_T(m, ...) \ + GMOCK_METHOD2_(typename, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD3_T(m, ...) \ + GMOCK_METHOD3_(typename, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD4_T(m, ...) \ + GMOCK_METHOD4_(typename, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD5_T(m, ...) \ + GMOCK_METHOD5_(typename, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD6_T(m, ...) \ + GMOCK_METHOD6_(typename, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD7_T(m, ...) \ + GMOCK_METHOD7_(typename, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD8_T(m, ...) \ + GMOCK_METHOD8_(typename, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD9_T(m, ...) \ + GMOCK_METHOD9_(typename, const, , m, __VA_ARGS__) +#define MOCK_CONST_METHOD10_T(m, ...) \ + GMOCK_METHOD10_(typename, const, , m, __VA_ARGS__) + +#define MOCK_METHOD0_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD0_(, , ct, m, __VA_ARGS__) +#define MOCK_METHOD1_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD1_(, , ct, m, __VA_ARGS__) +#define MOCK_METHOD2_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD2_(, , ct, m, __VA_ARGS__) +#define MOCK_METHOD3_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD3_(, , ct, m, __VA_ARGS__) +#define MOCK_METHOD4_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD4_(, , ct, m, __VA_ARGS__) +#define MOCK_METHOD5_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD5_(, , ct, m, __VA_ARGS__) +#define MOCK_METHOD6_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD6_(, , ct, m, __VA_ARGS__) +#define MOCK_METHOD7_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD7_(, , ct, m, __VA_ARGS__) +#define MOCK_METHOD8_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD8_(, , ct, m, __VA_ARGS__) +#define MOCK_METHOD9_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD9_(, , ct, m, __VA_ARGS__) +#define MOCK_METHOD10_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD10_(, , ct, m, __VA_ARGS__) + +#define MOCK_CONST_METHOD0_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD0_(, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD1_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD1_(, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD2_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD2_(, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD3_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD3_(, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD4_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD4_(, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD5_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD5_(, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD6_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD6_(, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD7_WITH_CALLTYPE(ct, m, ...) 
\ + GMOCK_METHOD7_(, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD8_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD8_(, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD9_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD9_(, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD10_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD10_(, const, ct, m, __VA_ARGS__) + +#define MOCK_METHOD0_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD0_(typename, , ct, m, __VA_ARGS__) +#define MOCK_METHOD1_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD1_(typename, , ct, m, __VA_ARGS__) +#define MOCK_METHOD2_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD2_(typename, , ct, m, __VA_ARGS__) +#define MOCK_METHOD3_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD3_(typename, , ct, m, __VA_ARGS__) +#define MOCK_METHOD4_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD4_(typename, , ct, m, __VA_ARGS__) +#define MOCK_METHOD5_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD5_(typename, , ct, m, __VA_ARGS__) +#define MOCK_METHOD6_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD6_(typename, , ct, m, __VA_ARGS__) +#define MOCK_METHOD7_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD7_(typename, , ct, m, __VA_ARGS__) +#define MOCK_METHOD8_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD8_(typename, , ct, m, __VA_ARGS__) +#define MOCK_METHOD9_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD9_(typename, , ct, m, __VA_ARGS__) +#define MOCK_METHOD10_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD10_(typename, , ct, m, __VA_ARGS__) + +#define MOCK_CONST_METHOD0_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD0_(typename, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD1_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD1_(typename, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD2_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD2_(typename, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD3_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD3_(typename, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD4_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD4_(typename, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD5_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD5_(typename, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD6_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD6_(typename, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD7_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD7_(typename, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD8_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD8_(typename, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD9_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD9_(typename, const, ct, m, __VA_ARGS__) +#define MOCK_CONST_METHOD10_T_WITH_CALLTYPE(ct, m, ...) \ + GMOCK_METHOD10_(typename, const, ct, m, __VA_ARGS__) + +// A MockFunction<F> class has one mock method whose type is F. It is +// useful when you just want your test code to emit some messages and +// have Google Mock verify the right messages are sent (and perhaps at +// the right times). 
For example, if you are exercising code: +// +// Foo(1); +// Foo(2); +// Foo(3); +// +// and want to verify that Foo(1) and Foo(3) both invoke +// mock.Bar("a"), but Foo(2) doesn't invoke anything, you can write: +// +// TEST(FooTest, InvokesBarCorrectly) { +// MyMock mock; +// MockFunction<void(string check_point_name)> check; +// { +// InSequence s; +// +// EXPECT_CALL(mock, Bar("a")); +// EXPECT_CALL(check, Call("1")); +// EXPECT_CALL(check, Call("2")); +// EXPECT_CALL(mock, Bar("a")); +// } +// Foo(1); +// check.Call("1"); +// Foo(2); +// check.Call("2"); +// Foo(3); +// } +// +// The expectation spec says that the first Bar("a") must happen +// before check point "1", the second Bar("a") must happen after check +// point "2", and nothing should happen between the two check +// points. The explicit check points make it easy to tell which +// Bar("a") is called by which call to Foo(). +// +// MockFunction<F> can also be used to exercise code that accepts +// std::function<F> callbacks. To do so, use AsStdFunction() method +// to create std::function proxy forwarding to original object's Call. +// Example: +// +// TEST(FooTest, RunsCallbackWithBarArgument) { +// MockFunction<int(string)> callback; +// EXPECT_CALL(callback, Call("bar")).WillOnce(Return(1)); +// Foo(callback.AsStdFunction()); +// } +template <typename F> +class MockFunction; + +template <typename R> +class MockFunction<R()> { + public: + MockFunction() {} + + MOCK_METHOD0_T(Call, R()); + +#if GTEST_HAS_STD_FUNCTION_ + std::function<R()> AsStdFunction() { + return [this]() -> R { + return this->Call(); + }; + } +#endif // GTEST_HAS_STD_FUNCTION_ + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(MockFunction); +}; + +template <typename R, typename A0> +class MockFunction<R(A0)> { + public: + MockFunction() {} + + MOCK_METHOD1_T(Call, R(A0)); + +#if GTEST_HAS_STD_FUNCTION_ + std::function<R(A0)> AsStdFunction() { + return [this](A0 a0) -> R { + return this->Call(a0); + }; + } +#endif // GTEST_HAS_STD_FUNCTION_ + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(MockFunction); +}; + +template <typename R, typename A0, typename A1> +class MockFunction<R(A0, A1)> { + public: + MockFunction() {} + + MOCK_METHOD2_T(Call, R(A0, A1)); + +#if GTEST_HAS_STD_FUNCTION_ + std::function<R(A0, A1)> AsStdFunction() { + return [this](A0 a0, A1 a1) -> R { + return this->Call(a0, a1); + }; + } +#endif // GTEST_HAS_STD_FUNCTION_ + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(MockFunction); +}; + +template <typename R, typename A0, typename A1, typename A2> +class MockFunction<R(A0, A1, A2)> { + public: + MockFunction() {} + + MOCK_METHOD3_T(Call, R(A0, A1, A2)); + +#if GTEST_HAS_STD_FUNCTION_ + std::function<R(A0, A1, A2)> AsStdFunction() { + return [this](A0 a0, A1 a1, A2 a2) -> R { + return this->Call(a0, a1, a2); + }; + } +#endif // GTEST_HAS_STD_FUNCTION_ + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(MockFunction); +}; + +template <typename R, typename A0, typename A1, typename A2, typename A3> +class MockFunction<R(A0, A1, A2, A3)> { + public: + MockFunction() {} + + MOCK_METHOD4_T(Call, R(A0, A1, A2, A3)); + +#if GTEST_HAS_STD_FUNCTION_ + std::function<R(A0, A1, A2, A3)> AsStdFunction() { + return [this](A0 a0, A1 a1, A2 a2, A3 a3) -> R { + return this->Call(a0, a1, a2, a3); + }; + } +#endif // GTEST_HAS_STD_FUNCTION_ + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(MockFunction); +}; + +template <typename R, typename A0, typename A1, typename A2, typename A3, + typename A4> +class MockFunction<R(A0, A1, A2, A3, A4)> { + public: + MockFunction() 
{} + + MOCK_METHOD5_T(Call, R(A0, A1, A2, A3, A4)); + +#if GTEST_HAS_STD_FUNCTION_ + std::function<R(A0, A1, A2, A3, A4)> AsStdFunction() { + return [this](A0 a0, A1 a1, A2 a2, A3 a3, A4 a4) -> R { + return this->Call(a0, a1, a2, a3, a4); + }; + } +#endif // GTEST_HAS_STD_FUNCTION_ + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(MockFunction); +}; + +template <typename R, typename A0, typename A1, typename A2, typename A3, + typename A4, typename A5> +class MockFunction<R(A0, A1, A2, A3, A4, A5)> { + public: + MockFunction() {} + + MOCK_METHOD6_T(Call, R(A0, A1, A2, A3, A4, A5)); + +#if GTEST_HAS_STD_FUNCTION_ + std::function<R(A0, A1, A2, A3, A4, A5)> AsStdFunction() { + return [this](A0 a0, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5) -> R { + return this->Call(a0, a1, a2, a3, a4, a5); + }; + } +#endif // GTEST_HAS_STD_FUNCTION_ + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(MockFunction); +}; + +template <typename R, typename A0, typename A1, typename A2, typename A3, + typename A4, typename A5, typename A6> +class MockFunction<R(A0, A1, A2, A3, A4, A5, A6)> { + public: + MockFunction() {} + + MOCK_METHOD7_T(Call, R(A0, A1, A2, A3, A4, A5, A6)); + +#if GTEST_HAS_STD_FUNCTION_ + std::function<R(A0, A1, A2, A3, A4, A5, A6)> AsStdFunction() { + return [this](A0 a0, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6) -> R { + return this->Call(a0, a1, a2, a3, a4, a5, a6); + }; + } +#endif // GTEST_HAS_STD_FUNCTION_ + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(MockFunction); +}; + +template <typename R, typename A0, typename A1, typename A2, typename A3, + typename A4, typename A5, typename A6, typename A7> +class MockFunction<R(A0, A1, A2, A3, A4, A5, A6, A7)> { + public: + MockFunction() {} + + MOCK_METHOD8_T(Call, R(A0, A1, A2, A3, A4, A5, A6, A7)); + +#if GTEST_HAS_STD_FUNCTION_ + std::function<R(A0, A1, A2, A3, A4, A5, A6, A7)> AsStdFunction() { + return [this](A0 a0, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6, A7 a7) -> R { + return this->Call(a0, a1, a2, a3, a4, a5, a6, a7); + }; + } +#endif // GTEST_HAS_STD_FUNCTION_ + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(MockFunction); +}; + +template <typename R, typename A0, typename A1, typename A2, typename A3, + typename A4, typename A5, typename A6, typename A7, typename A8> +class MockFunction<R(A0, A1, A2, A3, A4, A5, A6, A7, A8)> { + public: + MockFunction() {} + + MOCK_METHOD9_T(Call, R(A0, A1, A2, A3, A4, A5, A6, A7, A8)); + +#if GTEST_HAS_STD_FUNCTION_ + std::function<R(A0, A1, A2, A3, A4, A5, A6, A7, A8)> AsStdFunction() { + return [this](A0 a0, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6, A7 a7, + A8 a8) -> R { + return this->Call(a0, a1, a2, a3, a4, a5, a6, a7, a8); + }; + } +#endif // GTEST_HAS_STD_FUNCTION_ + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(MockFunction); +}; + +template <typename R, typename A0, typename A1, typename A2, typename A3, + typename A4, typename A5, typename A6, typename A7, typename A8, + typename A9> +class MockFunction<R(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9)> { + public: + MockFunction() {} + + MOCK_METHOD10_T(Call, R(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9)); + +#if GTEST_HAS_STD_FUNCTION_ + std::function<R(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9)> AsStdFunction() { + return [this](A0 a0, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, A6 a6, A7 a7, + A8 a8, A9 a9) -> R { + return this->Call(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9); + }; + } +#endif // GTEST_HAS_STD_FUNCTION_ + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(MockFunction); +}; + +} // namespace testing + +#endif // 
GMOCK_INCLUDE_GMOCK_GMOCK_GENERATED_FUNCTION_MOCKERS_H_ diff --git a/utils/unittest/googlemock/include/gmock/gmock-generated-matchers.h b/utils/unittest/googlemock/include/gmock/gmock-generated-matchers.h new file mode 100644 index 000000000000..57056fd91d22 --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/gmock-generated-matchers.h @@ -0,0 +1,2179 @@ +// This file was GENERATED by command: +// pump.py gmock-generated-matchers.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Google Mock - a framework for writing C++ mock classes. +// +// This file implements some commonly used variadic matchers. + +#ifndef GMOCK_INCLUDE_GMOCK_GMOCK_GENERATED_MATCHERS_H_ +#define GMOCK_INCLUDE_GMOCK_GMOCK_GENERATED_MATCHERS_H_ + +#include <iterator> +#include <sstream> +#include <string> +#include <vector> +#include "gmock/gmock-matchers.h" + +namespace testing { +namespace internal { + +// The type of the i-th (0-based) field of Tuple. +#define GMOCK_FIELD_TYPE_(Tuple, i) \ + typename ::testing::tuple_element<i, Tuple>::type + +// TupleFields<Tuple, k0, ..., kn> is for selecting fields from a +// tuple of type Tuple. It has two members: +// +// type: a tuple type whose i-th field is the ki-th field of Tuple. +// GetSelectedFields(t): returns fields k0, ..., and kn of t as a tuple. +// +// For example, in class TupleFields<tuple<bool, char, int>, 2, 0>, we have: +// +// type is tuple<int, bool>, and +// GetSelectedFields(make_tuple(true, 'a', 42)) is (42, true). + +template <class Tuple, int k0 = -1, int k1 = -1, int k2 = -1, int k3 = -1, + int k4 = -1, int k5 = -1, int k6 = -1, int k7 = -1, int k8 = -1, + int k9 = -1> +class TupleFields; + +// This generic version is used when there are 10 selectors. 
+template <class Tuple, int k0, int k1, int k2, int k3, int k4, int k5, int k6, + int k7, int k8, int k9> +class TupleFields { + public: + typedef ::testing::tuple<GMOCK_FIELD_TYPE_(Tuple, k0), + GMOCK_FIELD_TYPE_(Tuple, k1), GMOCK_FIELD_TYPE_(Tuple, k2), + GMOCK_FIELD_TYPE_(Tuple, k3), GMOCK_FIELD_TYPE_(Tuple, k4), + GMOCK_FIELD_TYPE_(Tuple, k5), GMOCK_FIELD_TYPE_(Tuple, k6), + GMOCK_FIELD_TYPE_(Tuple, k7), GMOCK_FIELD_TYPE_(Tuple, k8), + GMOCK_FIELD_TYPE_(Tuple, k9)> type; + static type GetSelectedFields(const Tuple& t) { + return type(get<k0>(t), get<k1>(t), get<k2>(t), get<k3>(t), get<k4>(t), + get<k5>(t), get<k6>(t), get<k7>(t), get<k8>(t), get<k9>(t)); + } +}; + +// The following specialization is used for 0 ~ 9 selectors. + +template <class Tuple> +class TupleFields<Tuple, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1> { + public: + typedef ::testing::tuple<> type; + static type GetSelectedFields(const Tuple& /* t */) { + return type(); + } +}; + +template <class Tuple, int k0> +class TupleFields<Tuple, k0, -1, -1, -1, -1, -1, -1, -1, -1, -1> { + public: + typedef ::testing::tuple<GMOCK_FIELD_TYPE_(Tuple, k0)> type; + static type GetSelectedFields(const Tuple& t) { + return type(get<k0>(t)); + } +}; + +template <class Tuple, int k0, int k1> +class TupleFields<Tuple, k0, k1, -1, -1, -1, -1, -1, -1, -1, -1> { + public: + typedef ::testing::tuple<GMOCK_FIELD_TYPE_(Tuple, k0), + GMOCK_FIELD_TYPE_(Tuple, k1)> type; + static type GetSelectedFields(const Tuple& t) { + return type(get<k0>(t), get<k1>(t)); + } +}; + +template <class Tuple, int k0, int k1, int k2> +class TupleFields<Tuple, k0, k1, k2, -1, -1, -1, -1, -1, -1, -1> { + public: + typedef ::testing::tuple<GMOCK_FIELD_TYPE_(Tuple, k0), + GMOCK_FIELD_TYPE_(Tuple, k1), GMOCK_FIELD_TYPE_(Tuple, k2)> type; + static type GetSelectedFields(const Tuple& t) { + return type(get<k0>(t), get<k1>(t), get<k2>(t)); + } +}; + +template <class Tuple, int k0, int k1, int k2, int k3> +class TupleFields<Tuple, k0, k1, k2, k3, -1, -1, -1, -1, -1, -1> { + public: + typedef ::testing::tuple<GMOCK_FIELD_TYPE_(Tuple, k0), + GMOCK_FIELD_TYPE_(Tuple, k1), GMOCK_FIELD_TYPE_(Tuple, k2), + GMOCK_FIELD_TYPE_(Tuple, k3)> type; + static type GetSelectedFields(const Tuple& t) { + return type(get<k0>(t), get<k1>(t), get<k2>(t), get<k3>(t)); + } +}; + +template <class Tuple, int k0, int k1, int k2, int k3, int k4> +class TupleFields<Tuple, k0, k1, k2, k3, k4, -1, -1, -1, -1, -1> { + public: + typedef ::testing::tuple<GMOCK_FIELD_TYPE_(Tuple, k0), + GMOCK_FIELD_TYPE_(Tuple, k1), GMOCK_FIELD_TYPE_(Tuple, k2), + GMOCK_FIELD_TYPE_(Tuple, k3), GMOCK_FIELD_TYPE_(Tuple, k4)> type; + static type GetSelectedFields(const Tuple& t) { + return type(get<k0>(t), get<k1>(t), get<k2>(t), get<k3>(t), get<k4>(t)); + } +}; + +template <class Tuple, int k0, int k1, int k2, int k3, int k4, int k5> +class TupleFields<Tuple, k0, k1, k2, k3, k4, k5, -1, -1, -1, -1> { + public: + typedef ::testing::tuple<GMOCK_FIELD_TYPE_(Tuple, k0), + GMOCK_FIELD_TYPE_(Tuple, k1), GMOCK_FIELD_TYPE_(Tuple, k2), + GMOCK_FIELD_TYPE_(Tuple, k3), GMOCK_FIELD_TYPE_(Tuple, k4), + GMOCK_FIELD_TYPE_(Tuple, k5)> type; + static type GetSelectedFields(const Tuple& t) { + return type(get<k0>(t), get<k1>(t), get<k2>(t), get<k3>(t), get<k4>(t), + get<k5>(t)); + } +}; + +template <class Tuple, int k0, int k1, int k2, int k3, int k4, int k5, int k6> +class TupleFields<Tuple, k0, k1, k2, k3, k4, k5, k6, -1, -1, -1> { + public: + typedef ::testing::tuple<GMOCK_FIELD_TYPE_(Tuple, k0), + GMOCK_FIELD_TYPE_(Tuple, k1), 
GMOCK_FIELD_TYPE_(Tuple, k2), + GMOCK_FIELD_TYPE_(Tuple, k3), GMOCK_FIELD_TYPE_(Tuple, k4), + GMOCK_FIELD_TYPE_(Tuple, k5), GMOCK_FIELD_TYPE_(Tuple, k6)> type; + static type GetSelectedFields(const Tuple& t) { + return type(get<k0>(t), get<k1>(t), get<k2>(t), get<k3>(t), get<k4>(t), + get<k5>(t), get<k6>(t)); + } +}; + +template <class Tuple, int k0, int k1, int k2, int k3, int k4, int k5, int k6, + int k7> +class TupleFields<Tuple, k0, k1, k2, k3, k4, k5, k6, k7, -1, -1> { + public: + typedef ::testing::tuple<GMOCK_FIELD_TYPE_(Tuple, k0), + GMOCK_FIELD_TYPE_(Tuple, k1), GMOCK_FIELD_TYPE_(Tuple, k2), + GMOCK_FIELD_TYPE_(Tuple, k3), GMOCK_FIELD_TYPE_(Tuple, k4), + GMOCK_FIELD_TYPE_(Tuple, k5), GMOCK_FIELD_TYPE_(Tuple, k6), + GMOCK_FIELD_TYPE_(Tuple, k7)> type; + static type GetSelectedFields(const Tuple& t) { + return type(get<k0>(t), get<k1>(t), get<k2>(t), get<k3>(t), get<k4>(t), + get<k5>(t), get<k6>(t), get<k7>(t)); + } +}; + +template <class Tuple, int k0, int k1, int k2, int k3, int k4, int k5, int k6, + int k7, int k8> +class TupleFields<Tuple, k0, k1, k2, k3, k4, k5, k6, k7, k8, -1> { + public: + typedef ::testing::tuple<GMOCK_FIELD_TYPE_(Tuple, k0), + GMOCK_FIELD_TYPE_(Tuple, k1), GMOCK_FIELD_TYPE_(Tuple, k2), + GMOCK_FIELD_TYPE_(Tuple, k3), GMOCK_FIELD_TYPE_(Tuple, k4), + GMOCK_FIELD_TYPE_(Tuple, k5), GMOCK_FIELD_TYPE_(Tuple, k6), + GMOCK_FIELD_TYPE_(Tuple, k7), GMOCK_FIELD_TYPE_(Tuple, k8)> type; + static type GetSelectedFields(const Tuple& t) { + return type(get<k0>(t), get<k1>(t), get<k2>(t), get<k3>(t), get<k4>(t), + get<k5>(t), get<k6>(t), get<k7>(t), get<k8>(t)); + } +}; + +#undef GMOCK_FIELD_TYPE_ + +// Implements the Args() matcher. +template <class ArgsTuple, int k0 = -1, int k1 = -1, int k2 = -1, int k3 = -1, + int k4 = -1, int k5 = -1, int k6 = -1, int k7 = -1, int k8 = -1, + int k9 = -1> +class ArgsMatcherImpl : public MatcherInterface<ArgsTuple> { + public: + // ArgsTuple may have top-level const or reference modifiers. + typedef GTEST_REMOVE_REFERENCE_AND_CONST_(ArgsTuple) RawArgsTuple; + typedef typename internal::TupleFields<RawArgsTuple, k0, k1, k2, k3, k4, k5, + k6, k7, k8, k9>::type SelectedArgs; + typedef Matcher<const SelectedArgs&> MonomorphicInnerMatcher; + + template <typename InnerMatcher> + explicit ArgsMatcherImpl(const InnerMatcher& inner_matcher) + : inner_matcher_(SafeMatcherCast<const SelectedArgs&>(inner_matcher)) {} + + virtual bool MatchAndExplain(ArgsTuple args, + MatchResultListener* listener) const { + const SelectedArgs& selected_args = GetSelectedArgs(args); + if (!listener->IsInterested()) + return inner_matcher_.Matches(selected_args); + + PrintIndices(listener->stream()); + *listener << "are " << PrintToString(selected_args); + + StringMatchResultListener inner_listener; + const bool match = inner_matcher_.MatchAndExplain(selected_args, + &inner_listener); + PrintIfNotEmpty(inner_listener.str(), listener->stream()); + return match; + } + + virtual void DescribeTo(::std::ostream* os) const { + *os << "are a tuple "; + PrintIndices(os); + inner_matcher_.DescribeTo(os); + } + + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "are a tuple "; + PrintIndices(os); + inner_matcher_.DescribeNegationTo(os); + } + + private: + static SelectedArgs GetSelectedArgs(ArgsTuple args) { + return TupleFields<RawArgsTuple, k0, k1, k2, k3, k4, k5, k6, k7, k8, + k9>::GetSelectedFields(args); + } + + // Prints the indices of the selected fields. 
+ static void PrintIndices(::std::ostream* os) { + *os << "whose fields ("; + const int indices[10] = { k0, k1, k2, k3, k4, k5, k6, k7, k8, k9 }; + for (int i = 0; i < 10; i++) { + if (indices[i] < 0) + break; + + if (i >= 1) + *os << ", "; + + *os << "#" << indices[i]; + } + *os << ") "; + } + + const MonomorphicInnerMatcher inner_matcher_; + + GTEST_DISALLOW_ASSIGN_(ArgsMatcherImpl); +}; + +template <class InnerMatcher, int k0 = -1, int k1 = -1, int k2 = -1, + int k3 = -1, int k4 = -1, int k5 = -1, int k6 = -1, int k7 = -1, + int k8 = -1, int k9 = -1> +class ArgsMatcher { + public: + explicit ArgsMatcher(const InnerMatcher& inner_matcher) + : inner_matcher_(inner_matcher) {} + + template <typename ArgsTuple> + operator Matcher<ArgsTuple>() const { + return MakeMatcher(new ArgsMatcherImpl<ArgsTuple, k0, k1, k2, k3, k4, k5, + k6, k7, k8, k9>(inner_matcher_)); + } + + private: + const InnerMatcher inner_matcher_; + + GTEST_DISALLOW_ASSIGN_(ArgsMatcher); +}; + +// A set of metafunctions for computing the result type of AllOf. +// AllOf(m1, ..., mN) returns +// AllOfResultN<decltype(m1), ..., decltype(mN)>::type. + +// Although AllOf isn't defined for one argument, AllOfResult1 is defined +// to simplify the implementation. +template <typename M1> +struct AllOfResult1 { + typedef M1 type; +}; + +template <typename M1, typename M2> +struct AllOfResult2 { + typedef BothOfMatcher< + typename AllOfResult1<M1>::type, + typename AllOfResult1<M2>::type + > type; +}; + +template <typename M1, typename M2, typename M3> +struct AllOfResult3 { + typedef BothOfMatcher< + typename AllOfResult1<M1>::type, + typename AllOfResult2<M2, M3>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4> +struct AllOfResult4 { + typedef BothOfMatcher< + typename AllOfResult2<M1, M2>::type, + typename AllOfResult2<M3, M4>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4, typename M5> +struct AllOfResult5 { + typedef BothOfMatcher< + typename AllOfResult2<M1, M2>::type, + typename AllOfResult3<M3, M4, M5>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6> +struct AllOfResult6 { + typedef BothOfMatcher< + typename AllOfResult3<M1, M2, M3>::type, + typename AllOfResult3<M4, M5, M6>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7> +struct AllOfResult7 { + typedef BothOfMatcher< + typename AllOfResult3<M1, M2, M3>::type, + typename AllOfResult4<M4, M5, M6, M7>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7, typename M8> +struct AllOfResult8 { + typedef BothOfMatcher< + typename AllOfResult4<M1, M2, M3, M4>::type, + typename AllOfResult4<M5, M6, M7, M8>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7, typename M8, typename M9> +struct AllOfResult9 { + typedef BothOfMatcher< + typename AllOfResult4<M1, M2, M3, M4>::type, + typename AllOfResult5<M5, M6, M7, M8, M9>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7, typename M8, typename M9, typename M10> +struct AllOfResult10 { + typedef BothOfMatcher< + typename AllOfResult5<M1, M2, M3, M4, M5>::type, + typename AllOfResult5<M6, M7, M8, M9, M10>::type + > type; +}; + +// A set of metafunctions for computing the result type of AnyOf. 
+// AnyOf(m1, ..., mN) returns +// AnyOfResultN<decltype(m1), ..., decltype(mN)>::type. + +// Although AnyOf isn't defined for one argument, AnyOfResult1 is defined +// to simplify the implementation. +template <typename M1> +struct AnyOfResult1 { + typedef M1 type; +}; + +template <typename M1, typename M2> +struct AnyOfResult2 { + typedef EitherOfMatcher< + typename AnyOfResult1<M1>::type, + typename AnyOfResult1<M2>::type + > type; +}; + +template <typename M1, typename M2, typename M3> +struct AnyOfResult3 { + typedef EitherOfMatcher< + typename AnyOfResult1<M1>::type, + typename AnyOfResult2<M2, M3>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4> +struct AnyOfResult4 { + typedef EitherOfMatcher< + typename AnyOfResult2<M1, M2>::type, + typename AnyOfResult2<M3, M4>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4, typename M5> +struct AnyOfResult5 { + typedef EitherOfMatcher< + typename AnyOfResult2<M1, M2>::type, + typename AnyOfResult3<M3, M4, M5>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6> +struct AnyOfResult6 { + typedef EitherOfMatcher< + typename AnyOfResult3<M1, M2, M3>::type, + typename AnyOfResult3<M4, M5, M6>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7> +struct AnyOfResult7 { + typedef EitherOfMatcher< + typename AnyOfResult3<M1, M2, M3>::type, + typename AnyOfResult4<M4, M5, M6, M7>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7, typename M8> +struct AnyOfResult8 { + typedef EitherOfMatcher< + typename AnyOfResult4<M1, M2, M3, M4>::type, + typename AnyOfResult4<M5, M6, M7, M8>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7, typename M8, typename M9> +struct AnyOfResult9 { + typedef EitherOfMatcher< + typename AnyOfResult4<M1, M2, M3, M4>::type, + typename AnyOfResult5<M5, M6, M7, M8, M9>::type + > type; +}; + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7, typename M8, typename M9, typename M10> +struct AnyOfResult10 { + typedef EitherOfMatcher< + typename AnyOfResult5<M1, M2, M3, M4, M5>::type, + typename AnyOfResult5<M6, M7, M8, M9, M10>::type + > type; +}; + +} // namespace internal + +// Args<N1, N2, ..., Nk>(a_matcher) matches a tuple if the selected +// fields of it matches a_matcher. C++ doesn't support default +// arguments for function templates, so we have to overload it. 
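A minimal usage sketch of the Args<N...>() selector documented above; the overload set it describes follows immediately below. MockFoo and its Blah() method are hypothetical names used only for illustration:

    #include "gmock/gmock.h"
    #include "gtest/gtest.h"

    using ::testing::_;
    using ::testing::Args;
    using ::testing::Lt;

    class MockFoo {
     public:
      MOCK_METHOD3(Blah, void(int, int, int));  // conceptually Blah(x, y, z)
    };

    TEST(ArgsDemo, SelectsTupleFields) {
      MockFoo foo;
      // Args<0, 2> keeps fields #0 and #2 of the argument tuple and drops
      // field #1; Lt() then requires the selected pair to satisfy x < z.
      EXPECT_CALL(foo, Blah(_, _, _)).With(Args<0, 2>(Lt()));
      foo.Blah(1, 7, 2);  // x = 1 < z = 2, so the expectation is satisfied.
    }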
+template <typename InnerMatcher> +inline internal::ArgsMatcher<InnerMatcher> +Args(const InnerMatcher& matcher) { + return internal::ArgsMatcher<InnerMatcher>(matcher); +} + +template <int k1, typename InnerMatcher> +inline internal::ArgsMatcher<InnerMatcher, k1> +Args(const InnerMatcher& matcher) { + return internal::ArgsMatcher<InnerMatcher, k1>(matcher); +} + +template <int k1, int k2, typename InnerMatcher> +inline internal::ArgsMatcher<InnerMatcher, k1, k2> +Args(const InnerMatcher& matcher) { + return internal::ArgsMatcher<InnerMatcher, k1, k2>(matcher); +} + +template <int k1, int k2, int k3, typename InnerMatcher> +inline internal::ArgsMatcher<InnerMatcher, k1, k2, k3> +Args(const InnerMatcher& matcher) { + return internal::ArgsMatcher<InnerMatcher, k1, k2, k3>(matcher); +} + +template <int k1, int k2, int k3, int k4, typename InnerMatcher> +inline internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4> +Args(const InnerMatcher& matcher) { + return internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4>(matcher); +} + +template <int k1, int k2, int k3, int k4, int k5, typename InnerMatcher> +inline internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4, k5> +Args(const InnerMatcher& matcher) { + return internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4, k5>(matcher); +} + +template <int k1, int k2, int k3, int k4, int k5, int k6, typename InnerMatcher> +inline internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4, k5, k6> +Args(const InnerMatcher& matcher) { + return internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4, k5, k6>(matcher); +} + +template <int k1, int k2, int k3, int k4, int k5, int k6, int k7, + typename InnerMatcher> +inline internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4, k5, k6, k7> +Args(const InnerMatcher& matcher) { + return internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4, k5, k6, + k7>(matcher); +} + +template <int k1, int k2, int k3, int k4, int k5, int k6, int k7, int k8, + typename InnerMatcher> +inline internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4, k5, k6, k7, k8> +Args(const InnerMatcher& matcher) { + return internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4, k5, k6, k7, + k8>(matcher); +} + +template <int k1, int k2, int k3, int k4, int k5, int k6, int k7, int k8, + int k9, typename InnerMatcher> +inline internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4, k5, k6, k7, k8, k9> +Args(const InnerMatcher& matcher) { + return internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4, k5, k6, k7, k8, + k9>(matcher); +} + +template <int k1, int k2, int k3, int k4, int k5, int k6, int k7, int k8, + int k9, int k10, typename InnerMatcher> +inline internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4, k5, k6, k7, k8, k9, + k10> +Args(const InnerMatcher& matcher) { + return internal::ArgsMatcher<InnerMatcher, k1, k2, k3, k4, k5, k6, k7, k8, + k9, k10>(matcher); +} + +// ElementsAre(e_1, e_2, ... e_n) matches an STL-style container with +// n elements, where the i-th element in the container must +// match the i-th argument in the list. Each argument of +// ElementsAre() can be either a value or a matcher. We support up to +// 10 arguments. +// +// The use of DecayArray in the implementation allows ElementsAre() +// to accept string literals, whose type is const char[N], but we +// want to treat them as const char*. +// +// NOTE: Since ElementsAre() cares about the order of the elements, it +// must not be used with containers whose elements's order is +// undefined (e.g. hash_map). 
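A short sketch of ElementsAre() as documented above, shown together with UnorderedElementsAre(), which this header defines further below:

    #include <vector>
    #include "gmock/gmock.h"
    #include "gtest/gtest.h"

    using ::testing::ElementsAre;
    using ::testing::Gt;
    using ::testing::UnorderedElementsAre;

    TEST(ElementsAreDemo, ValuesAndMatchersMix) {
      std::vector<int> v;
      v.push_back(1);
      v.push_back(5);
      v.push_back(3);
      // Each argument may be a plain value or a matcher; order matters.
      EXPECT_THAT(v, ElementsAre(1, Gt(4), 3));
      // The unordered variant accepts the same elements in any permutation.
      EXPECT_THAT(v, UnorderedElementsAre(3, 1, 5));
    }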
+ +inline internal::ElementsAreMatcher< + ::testing::tuple<> > +ElementsAre() { + typedef ::testing::tuple<> Args; + return internal::ElementsAreMatcher<Args>(Args()); +} + +template <typename T1> +inline internal::ElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type> > +ElementsAre(const T1& e1) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type> Args; + return internal::ElementsAreMatcher<Args>(Args(e1)); +} + +template <typename T1, typename T2> +inline internal::ElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type> > +ElementsAre(const T1& e1, const T2& e2) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type> Args; + return internal::ElementsAreMatcher<Args>(Args(e1, e2)); +} + +template <typename T1, typename T2, typename T3> +inline internal::ElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type> > +ElementsAre(const T1& e1, const T2& e2, const T3& e3) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type> Args; + return internal::ElementsAreMatcher<Args>(Args(e1, e2, e3)); +} + +template <typename T1, typename T2, typename T3, typename T4> +inline internal::ElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type> > +ElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type> Args; + return internal::ElementsAreMatcher<Args>(Args(e1, e2, e3, e4)); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5> +inline internal::ElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type> > +ElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4, + const T5& e5) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type> Args; + return internal::ElementsAreMatcher<Args>(Args(e1, e2, e3, e4, e5)); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6> +inline internal::ElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type> > +ElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4, + const T5& e5, const T6& e6) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename 
internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type> Args; + return internal::ElementsAreMatcher<Args>(Args(e1, e2, e3, e4, e5, e6)); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7> +inline internal::ElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type> > +ElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4, + const T5& e5, const T6& e6, const T7& e7) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type> Args; + return internal::ElementsAreMatcher<Args>(Args(e1, e2, e3, e4, e5, e6, e7)); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8> +inline internal::ElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type, + typename internal::DecayArray<T8>::type> > +ElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4, + const T5& e5, const T6& e6, const T7& e7, const T8& e8) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type, + typename internal::DecayArray<T8>::type> Args; + return internal::ElementsAreMatcher<Args>(Args(e1, e2, e3, e4, e5, e6, e7, + e8)); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9> +inline internal::ElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type, + typename internal::DecayArray<T8>::type, + typename internal::DecayArray<T9>::type> > +ElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4, + const T5& e5, const T6& e6, const T7& e7, const T8& e8, const T9& e9) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type, + typename internal::DecayArray<T8>::type, + typename internal::DecayArray<T9>::type> Args; + return internal::ElementsAreMatcher<Args>(Args(e1, e2, e3, e4, e5, e6, e7, + e8, e9)); +} + +template <typename T1, typename T2, 
typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10> +inline internal::ElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type, + typename internal::DecayArray<T8>::type, + typename internal::DecayArray<T9>::type, + typename internal::DecayArray<T10>::type> > +ElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4, + const T5& e5, const T6& e6, const T7& e7, const T8& e8, const T9& e9, + const T10& e10) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type, + typename internal::DecayArray<T8>::type, + typename internal::DecayArray<T9>::type, + typename internal::DecayArray<T10>::type> Args; + return internal::ElementsAreMatcher<Args>(Args(e1, e2, e3, e4, e5, e6, e7, + e8, e9, e10)); +} + +// UnorderedElementsAre(e_1, e_2, ..., e_n) is an ElementsAre extension +// that matches n elements in any order. We support up to n=10 arguments. + +inline internal::UnorderedElementsAreMatcher< + ::testing::tuple<> > +UnorderedElementsAre() { + typedef ::testing::tuple<> Args; + return internal::UnorderedElementsAreMatcher<Args>(Args()); +} + +template <typename T1> +inline internal::UnorderedElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type> > +UnorderedElementsAre(const T1& e1) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type> Args; + return internal::UnorderedElementsAreMatcher<Args>(Args(e1)); +} + +template <typename T1, typename T2> +inline internal::UnorderedElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type> > +UnorderedElementsAre(const T1& e1, const T2& e2) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type> Args; + return internal::UnorderedElementsAreMatcher<Args>(Args(e1, e2)); +} + +template <typename T1, typename T2, typename T3> +inline internal::UnorderedElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type> > +UnorderedElementsAre(const T1& e1, const T2& e2, const T3& e3) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type> Args; + return internal::UnorderedElementsAreMatcher<Args>(Args(e1, e2, e3)); +} + +template <typename T1, typename T2, typename T3, typename T4> +inline internal::UnorderedElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type> > +UnorderedElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename 
internal::DecayArray<T4>::type> Args; + return internal::UnorderedElementsAreMatcher<Args>(Args(e1, e2, e3, e4)); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5> +inline internal::UnorderedElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type> > +UnorderedElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4, + const T5& e5) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type> Args; + return internal::UnorderedElementsAreMatcher<Args>(Args(e1, e2, e3, e4, e5)); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6> +inline internal::UnorderedElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type> > +UnorderedElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4, + const T5& e5, const T6& e6) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type> Args; + return internal::UnorderedElementsAreMatcher<Args>(Args(e1, e2, e3, e4, e5, + e6)); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7> +inline internal::UnorderedElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type> > +UnorderedElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4, + const T5& e5, const T6& e6, const T7& e7) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type> Args; + return internal::UnorderedElementsAreMatcher<Args>(Args(e1, e2, e3, e4, e5, + e6, e7)); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8> +inline internal::UnorderedElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type, + typename internal::DecayArray<T8>::type> > +UnorderedElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4, + const T5& e5, const T6& e6, const T7& e7, const T8& e8) { + typedef ::testing::tuple< + typename 
internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type, + typename internal::DecayArray<T8>::type> Args; + return internal::UnorderedElementsAreMatcher<Args>(Args(e1, e2, e3, e4, e5, + e6, e7, e8)); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9> +inline internal::UnorderedElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type, + typename internal::DecayArray<T8>::type, + typename internal::DecayArray<T9>::type> > +UnorderedElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4, + const T5& e5, const T6& e6, const T7& e7, const T8& e8, const T9& e9) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type, + typename internal::DecayArray<T8>::type, + typename internal::DecayArray<T9>::type> Args; + return internal::UnorderedElementsAreMatcher<Args>(Args(e1, e2, e3, e4, e5, + e6, e7, e8, e9)); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10> +inline internal::UnorderedElementsAreMatcher< + ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type, + typename internal::DecayArray<T8>::type, + typename internal::DecayArray<T9>::type, + typename internal::DecayArray<T10>::type> > +UnorderedElementsAre(const T1& e1, const T2& e2, const T3& e3, const T4& e4, + const T5& e5, const T6& e6, const T7& e7, const T8& e8, const T9& e9, + const T10& e10) { + typedef ::testing::tuple< + typename internal::DecayArray<T1>::type, + typename internal::DecayArray<T2>::type, + typename internal::DecayArray<T3>::type, + typename internal::DecayArray<T4>::type, + typename internal::DecayArray<T5>::type, + typename internal::DecayArray<T6>::type, + typename internal::DecayArray<T7>::type, + typename internal::DecayArray<T8>::type, + typename internal::DecayArray<T9>::type, + typename internal::DecayArray<T10>::type> Args; + return internal::UnorderedElementsAreMatcher<Args>(Args(e1, e2, e3, e4, e5, + e6, e7, e8, e9, e10)); +} + +// AllOf(m1, m2, ..., mk) matches any value that matches all of the given +// sub-matchers. AllOf is called fully qualified to prevent ADL from firing. 
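A minimal sketch of AllOf() as described above. Note how the overloads below split their argument lists in half and nest BothOfMatcher nodes, so only the two-argument form combines matchers directly:

    #include "gmock/gmock.h"
    #include "gtest/gtest.h"

    using ::testing::AllOf;
    using ::testing::Gt;
    using ::testing::Lt;
    using ::testing::Ne;

    TEST(AllOfDemo, EverySubMatcherMustMatch) {
      // Matches only because the value is > 0, < 100, and != 42.
      EXPECT_THAT(7, AllOf(Gt(0), Lt(100), Ne(42)));
    }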
+ +template <typename M1, typename M2> +inline typename internal::AllOfResult2<M1, M2>::type +AllOf(M1 m1, M2 m2) { + return typename internal::AllOfResult2<M1, M2>::type( + m1, + m2); +} + +template <typename M1, typename M2, typename M3> +inline typename internal::AllOfResult3<M1, M2, M3>::type +AllOf(M1 m1, M2 m2, M3 m3) { + return typename internal::AllOfResult3<M1, M2, M3>::type( + m1, + ::testing::AllOf(m2, m3)); +} + +template <typename M1, typename M2, typename M3, typename M4> +inline typename internal::AllOfResult4<M1, M2, M3, M4>::type +AllOf(M1 m1, M2 m2, M3 m3, M4 m4) { + return typename internal::AllOfResult4<M1, M2, M3, M4>::type( + ::testing::AllOf(m1, m2), + ::testing::AllOf(m3, m4)); +} + +template <typename M1, typename M2, typename M3, typename M4, typename M5> +inline typename internal::AllOfResult5<M1, M2, M3, M4, M5>::type +AllOf(M1 m1, M2 m2, M3 m3, M4 m4, M5 m5) { + return typename internal::AllOfResult5<M1, M2, M3, M4, M5>::type( + ::testing::AllOf(m1, m2), + ::testing::AllOf(m3, m4, m5)); +} + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6> +inline typename internal::AllOfResult6<M1, M2, M3, M4, M5, M6>::type +AllOf(M1 m1, M2 m2, M3 m3, M4 m4, M5 m5, M6 m6) { + return typename internal::AllOfResult6<M1, M2, M3, M4, M5, M6>::type( + ::testing::AllOf(m1, m2, m3), + ::testing::AllOf(m4, m5, m6)); +} + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7> +inline typename internal::AllOfResult7<M1, M2, M3, M4, M5, M6, M7>::type +AllOf(M1 m1, M2 m2, M3 m3, M4 m4, M5 m5, M6 m6, M7 m7) { + return typename internal::AllOfResult7<M1, M2, M3, M4, M5, M6, M7>::type( + ::testing::AllOf(m1, m2, m3), + ::testing::AllOf(m4, m5, m6, m7)); +} + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7, typename M8> +inline typename internal::AllOfResult8<M1, M2, M3, M4, M5, M6, M7, M8>::type +AllOf(M1 m1, M2 m2, M3 m3, M4 m4, M5 m5, M6 m6, M7 m7, M8 m8) { + return typename internal::AllOfResult8<M1, M2, M3, M4, M5, M6, M7, M8>::type( + ::testing::AllOf(m1, m2, m3, m4), + ::testing::AllOf(m5, m6, m7, m8)); +} + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7, typename M8, typename M9> +inline typename internal::AllOfResult9<M1, M2, M3, M4, M5, M6, M7, M8, M9>::type +AllOf(M1 m1, M2 m2, M3 m3, M4 m4, M5 m5, M6 m6, M7 m7, M8 m8, M9 m9) { + return typename internal::AllOfResult9<M1, M2, M3, M4, M5, M6, M7, M8, + M9>::type( + ::testing::AllOf(m1, m2, m3, m4), + ::testing::AllOf(m5, m6, m7, m8, m9)); +} + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7, typename M8, typename M9, typename M10> +inline typename internal::AllOfResult10<M1, M2, M3, M4, M5, M6, M7, M8, M9, + M10>::type +AllOf(M1 m1, M2 m2, M3 m3, M4 m4, M5 m5, M6 m6, M7 m7, M8 m8, M9 m9, M10 m10) { + return typename internal::AllOfResult10<M1, M2, M3, M4, M5, M6, M7, M8, M9, + M10>::type( + ::testing::AllOf(m1, m2, m3, m4, m5), + ::testing::AllOf(m6, m7, m8, m9, m10)); +} + +// AnyOf(m1, m2, ..., mk) matches any value that matches any of the given +// sub-matchers. AnyOf is called fully qualified to prevent ADL from firing. 
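The mirror-image sketch for AnyOf(), which succeeds as soon as any one sub-matcher matches; the overloads below build the same halved tree as AllOf(), but out of EitherOfMatcher nodes:

    #include "gmock/gmock.h"
    #include "gtest/gtest.h"

    using ::testing::AnyOf;
    using ::testing::Gt;
    using ::testing::Lt;

    TEST(AnyOfDemo, OneSubMatcherSuffices) {
      EXPECT_THAT(-5, AnyOf(Lt(0), Gt(100)));   // matches via Lt(0)
      EXPECT_THAT(200, AnyOf(Lt(0), Gt(100)));  // matches via Gt(100)
    }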
+ +template <typename M1, typename M2> +inline typename internal::AnyOfResult2<M1, M2>::type +AnyOf(M1 m1, M2 m2) { + return typename internal::AnyOfResult2<M1, M2>::type( + m1, + m2); +} + +template <typename M1, typename M2, typename M3> +inline typename internal::AnyOfResult3<M1, M2, M3>::type +AnyOf(M1 m1, M2 m2, M3 m3) { + return typename internal::AnyOfResult3<M1, M2, M3>::type( + m1, + ::testing::AnyOf(m2, m3)); +} + +template <typename M1, typename M2, typename M3, typename M4> +inline typename internal::AnyOfResult4<M1, M2, M3, M4>::type +AnyOf(M1 m1, M2 m2, M3 m3, M4 m4) { + return typename internal::AnyOfResult4<M1, M2, M3, M4>::type( + ::testing::AnyOf(m1, m2), + ::testing::AnyOf(m3, m4)); +} + +template <typename M1, typename M2, typename M3, typename M4, typename M5> +inline typename internal::AnyOfResult5<M1, M2, M3, M4, M5>::type +AnyOf(M1 m1, M2 m2, M3 m3, M4 m4, M5 m5) { + return typename internal::AnyOfResult5<M1, M2, M3, M4, M5>::type( + ::testing::AnyOf(m1, m2), + ::testing::AnyOf(m3, m4, m5)); +} + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6> +inline typename internal::AnyOfResult6<M1, M2, M3, M4, M5, M6>::type +AnyOf(M1 m1, M2 m2, M3 m3, M4 m4, M5 m5, M6 m6) { + return typename internal::AnyOfResult6<M1, M2, M3, M4, M5, M6>::type( + ::testing::AnyOf(m1, m2, m3), + ::testing::AnyOf(m4, m5, m6)); +} + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7> +inline typename internal::AnyOfResult7<M1, M2, M3, M4, M5, M6, M7>::type +AnyOf(M1 m1, M2 m2, M3 m3, M4 m4, M5 m5, M6 m6, M7 m7) { + return typename internal::AnyOfResult7<M1, M2, M3, M4, M5, M6, M7>::type( + ::testing::AnyOf(m1, m2, m3), + ::testing::AnyOf(m4, m5, m6, m7)); +} + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7, typename M8> +inline typename internal::AnyOfResult8<M1, M2, M3, M4, M5, M6, M7, M8>::type +AnyOf(M1 m1, M2 m2, M3 m3, M4 m4, M5 m5, M6 m6, M7 m7, M8 m8) { + return typename internal::AnyOfResult8<M1, M2, M3, M4, M5, M6, M7, M8>::type( + ::testing::AnyOf(m1, m2, m3, m4), + ::testing::AnyOf(m5, m6, m7, m8)); +} + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7, typename M8, typename M9> +inline typename internal::AnyOfResult9<M1, M2, M3, M4, M5, M6, M7, M8, M9>::type +AnyOf(M1 m1, M2 m2, M3 m3, M4 m4, M5 m5, M6 m6, M7 m7, M8 m8, M9 m9) { + return typename internal::AnyOfResult9<M1, M2, M3, M4, M5, M6, M7, M8, + M9>::type( + ::testing::AnyOf(m1, m2, m3, m4), + ::testing::AnyOf(m5, m6, m7, m8, m9)); +} + +template <typename M1, typename M2, typename M3, typename M4, typename M5, + typename M6, typename M7, typename M8, typename M9, typename M10> +inline typename internal::AnyOfResult10<M1, M2, M3, M4, M5, M6, M7, M8, M9, + M10>::type +AnyOf(M1 m1, M2 m2, M3 m3, M4 m4, M5 m5, M6 m6, M7 m7, M8 m8, M9 m9, M10 m10) { + return typename internal::AnyOfResult10<M1, M2, M3, M4, M5, M6, M7, M8, M9, + M10>::type( + ::testing::AnyOf(m1, m2, m3, m4, m5), + ::testing::AnyOf(m6, m7, m8, m9, m10)); +} + +} // namespace testing + + +// The MATCHER* family of macros can be used in a namespace scope to +// define custom matchers easily. +// +// Basic Usage +// =========== +// +// The syntax +// +// MATCHER(name, description_string) { statements; } +// +// defines a matcher with the given name that executes the statements, +// which must return a bool to indicate if the match succeeds. 
Inside +// the statements, you can refer to the value being matched by 'arg', +// and refer to its type by 'arg_type'. +// +// The description string documents what the matcher does, and is used +// to generate the failure message when the match fails. Since a +// MATCHER() is usually defined in a header file shared by multiple +// C++ source files, we require the description to be a C-string +// literal to avoid possible side effects. It can be empty, in which +// case we'll use the sequence of words in the matcher name as the +// description. +// +// For example: +// +// MATCHER(IsEven, "") { return (arg % 2) == 0; } +// +// allows you to write +// +// // Expects mock_foo.Bar(n) to be called where n is even. +// EXPECT_CALL(mock_foo, Bar(IsEven())); +// +// or, +// +// // Verifies that the value of some_expression is even. +// EXPECT_THAT(some_expression, IsEven()); +// +// If the above assertion fails, it will print something like: +// +// Value of: some_expression +// Expected: is even +// Actual: 7 +// +// where the description "is even" is automatically calculated from the +// matcher name IsEven. +// +// Argument Type +// ============= +// +// Note that the type of the value being matched (arg_type) is +// determined by the context in which you use the matcher and is +// supplied to you by the compiler, so you don't need to worry about +// declaring it (nor can you). This allows the matcher to be +// polymorphic. For example, IsEven() can be used to match any type +// where the value of "(arg % 2) == 0" can be implicitly converted to +// a bool. In the "Bar(IsEven())" example above, if method Bar() +// takes an int, 'arg_type' will be int; if it takes an unsigned long, +// 'arg_type' will be unsigned long; and so on. +// +// Parameterizing Matchers +// ======================= +// +// Sometimes you'll want to parameterize the matcher. For that you +// can use another macro: +// +// MATCHER_P(name, param_name, description_string) { statements; } +// +// For example: +// +// MATCHER_P(HasAbsoluteValue, value, "") { return abs(arg) == value; } +// +// will allow you to write: +// +// EXPECT_THAT(Blah("a"), HasAbsoluteValue(n)); +// +// which may lead to this message (assuming n is 10): +// +// Value of: Blah("a") +// Expected: has absolute value 10 +// Actual: -9 +// +// Note that both the matcher description and its parameter are +// printed, making the message human-friendly. +// +// In the matcher definition body, you can write 'foo_type' to +// reference the type of a parameter named 'foo'. For example, in the +// body of MATCHER_P(HasAbsoluteValue, value) above, you can write +// 'value_type' to refer to the type of 'value'. +// +// We also provide MATCHER_P2, MATCHER_P3, ..., up to MATCHER_P10 to +// support multi-parameter matchers. +// +// Describing Parameterized Matchers +// ================================= +// +// The last argument to MATCHER*() is a string-typed expression. The +// expression can reference all of the matcher's parameters and a +// special bool-typed variable named 'negation'. When 'negation' is +// false, the expression should evaluate to the matcher's description; +// otherwise it should evaluate to the description of the negation of +// the matcher. For example, +// +// using testing::PrintToString; +// +// MATCHER_P2(InClosedRange, low, hi, +// string(negation ? "is not" : "is") + " in range [" + +// PrintToString(low) + ", " + PrintToString(hi) + "]") { +// return low <= arg && arg <= hi; +// } +// ... 
+//   EXPECT_THAT(3, InClosedRange(4, 6));
+//   EXPECT_THAT(3, Not(InClosedRange(2, 4)));
+//
+// would generate two failures that contain the text:
+//
+//   Expected: is in range [4, 6]
+//   ...
+//   Expected: is not in range [2, 4]
+//
+// If you specify "" as the description, the failure message will
+// contain the sequence of words in the matcher name followed by the
+// parameter values printed as a tuple.  For example,
+//
+//   MATCHER_P2(InClosedRange, low, hi, "") { ... }
+//   ...
+//   EXPECT_THAT(3, InClosedRange(4, 6));
+//   EXPECT_THAT(3, Not(InClosedRange(2, 4)));
+//
+// would generate two failures that contain the text:
+//
+//   Expected: in closed range (4, 6)
+//   ...
+//   Expected: not (in closed range (2, 4))
+//
+// Types of Matcher Parameters
+// ===========================
+//
+// For the purpose of typing, you can view
+//
+//   MATCHER_Pk(Foo, p1, ..., pk, description_string) { ... }
+//
+// as shorthand for
+//
+//   template <typename p1_type, ..., typename pk_type>
+//   FooMatcherPk<p1_type, ..., pk_type>
+//   Foo(p1_type p1, ..., pk_type pk) { ... }
+//
+// When you write Foo(v1, ..., vk), the compiler infers the types of
+// the parameters v1, ..., and vk for you.  If you are not happy with
+// the result of the type inference, you can specify the types by
+// explicitly instantiating the template, as in Foo<long, bool>(5,
+// false).  As said earlier, you don't get to (or need to) specify
+// 'arg_type' as that's determined by the context in which the matcher
+// is used.  You can assign the result of expression Foo(p1, ..., pk)
+// to a variable of type FooMatcherPk<p1_type, ..., pk_type>.  This
+// can be useful when composing matchers.
+//
+// While you can instantiate a matcher template with reference types,
+// passing the parameters by pointer usually makes your code more
+// readable.  If, however, you still want to pass a parameter by
+// reference, be aware that in the failure message generated by the
+// matcher you will see the value of the referenced object but not its
+// address.
+//
+// Explaining Match Results
+// ========================
+//
+// Sometimes the matcher description alone isn't enough to explain why
+// the match has failed or succeeded.  For example, when expecting a
+// long string, it can be very helpful to also print the diff between
+// the expected string and the actual one.  To achieve that, you can
+// optionally stream additional information to a special variable
+// named result_listener, whose type is a pointer to class
+// MatchResultListener:
+//
+//   MATCHER_P(EqualsLongString, str, "") {
+//     if (arg == str) return true;
+//
+//     *result_listener << "the difference: "
+//                      << DiffStrings(str, arg);
+//     return false;
+//   }
+//
+// Overloading Matchers
+// ====================
+//
+// You can overload matchers with different numbers of parameters:
+//
+//   MATCHER_P(Blah, a, description_string1) { ... }
+//   MATCHER_P2(Blah, a, b, description_string2) { ... }
+//
+// Caveats
+// =======
+//
+// When defining a new matcher, you should also consider implementing
+// MatcherInterface or using MakePolymorphicMatcher().  These
+// approaches require more work than the MATCHER* macros, but also
+// give you more control over the types of the value being matched and
+// the matcher parameters, which may lead to better compiler error
+// messages when the matcher is used incorrectly.  They also allow
+// overloading matchers based on parameter types (as opposed to just
+// based on the number of parameters).
+//
+// MATCHER*() can only be used in a namespace scope.
The reason is +// that C++ doesn't yet allow function-local types to be used to +// instantiate templates. The up-coming C++0x standard will fix this. +// Once that's done, we'll consider supporting using MATCHER*() inside +// a function. +// +// More Information +// ================ +// +// To learn more about using these macros, please search for 'MATCHER' +// on http://code.google.com/p/googlemock/wiki/CookBook. + +#define MATCHER(name, description)\ + class name##Matcher {\ + public:\ + template <typename arg_type>\ + class gmock_Impl : public ::testing::MatcherInterface<arg_type> {\ + public:\ + gmock_Impl()\ + {}\ + virtual bool MatchAndExplain(\ + arg_type arg, ::testing::MatchResultListener* result_listener) const;\ + virtual void DescribeTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(false);\ + }\ + virtual void DescribeNegationTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(true);\ + }\ + private:\ + ::testing::internal::string FormatDescription(bool negation) const {\ + const ::testing::internal::string gmock_description = (description);\ + if (!gmock_description.empty())\ + return gmock_description;\ + return ::testing::internal::FormatMatcherDescription(\ + negation, #name, \ + ::testing::internal::UniversalTersePrintTupleFieldsToStrings(\ + ::testing::tuple<>()));\ + }\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename arg_type>\ + operator ::testing::Matcher<arg_type>() const {\ + return ::testing::Matcher<arg_type>(\ + new gmock_Impl<arg_type>());\ + }\ + name##Matcher() {\ + }\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##Matcher);\ + };\ + inline name##Matcher name() {\ + return name##Matcher();\ + }\ + template <typename arg_type>\ + bool name##Matcher::gmock_Impl<arg_type>::MatchAndExplain(\ + arg_type arg, \ + ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_)\ + const + +#define MATCHER_P(name, p0, description)\ + template <typename p0##_type>\ + class name##MatcherP {\ + public:\ + template <typename arg_type>\ + class gmock_Impl : public ::testing::MatcherInterface<arg_type> {\ + public:\ + explicit gmock_Impl(p0##_type gmock_p0)\ + : p0(gmock_p0) {}\ + virtual bool MatchAndExplain(\ + arg_type arg, ::testing::MatchResultListener* result_listener) const;\ + virtual void DescribeTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(false);\ + }\ + virtual void DescribeNegationTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(true);\ + }\ + p0##_type p0;\ + private:\ + ::testing::internal::string FormatDescription(bool negation) const {\ + const ::testing::internal::string gmock_description = (description);\ + if (!gmock_description.empty())\ + return gmock_description;\ + return ::testing::internal::FormatMatcherDescription(\ + negation, #name, \ + ::testing::internal::UniversalTersePrintTupleFieldsToStrings(\ + ::testing::tuple<p0##_type>(p0)));\ + }\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename arg_type>\ + operator ::testing::Matcher<arg_type>() const {\ + return ::testing::Matcher<arg_type>(\ + new gmock_Impl<arg_type>(p0));\ + }\ + explicit name##MatcherP(p0##_type gmock_p0) : p0(gmock_p0) {\ + }\ + p0##_type p0;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##MatcherP);\ + };\ + template <typename p0##_type>\ + inline name##MatcherP<p0##_type> name(p0##_type p0) {\ + return name##MatcherP<p0##_type>(p0);\ + }\ + template <typename p0##_type>\ + template <typename arg_type>\ + bool 
name##MatcherP<p0##_type>::gmock_Impl<arg_type>::MatchAndExplain(\ + arg_type arg, \ + ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_)\ + const + +#define MATCHER_P2(name, p0, p1, description)\ + template <typename p0##_type, typename p1##_type>\ + class name##MatcherP2 {\ + public:\ + template <typename arg_type>\ + class gmock_Impl : public ::testing::MatcherInterface<arg_type> {\ + public:\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1)\ + : p0(gmock_p0), p1(gmock_p1) {}\ + virtual bool MatchAndExplain(\ + arg_type arg, ::testing::MatchResultListener* result_listener) const;\ + virtual void DescribeTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(false);\ + }\ + virtual void DescribeNegationTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(true);\ + }\ + p0##_type p0;\ + p1##_type p1;\ + private:\ + ::testing::internal::string FormatDescription(bool negation) const {\ + const ::testing::internal::string gmock_description = (description);\ + if (!gmock_description.empty())\ + return gmock_description;\ + return ::testing::internal::FormatMatcherDescription(\ + negation, #name, \ + ::testing::internal::UniversalTersePrintTupleFieldsToStrings(\ + ::testing::tuple<p0##_type, p1##_type>(p0, p1)));\ + }\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename arg_type>\ + operator ::testing::Matcher<arg_type>() const {\ + return ::testing::Matcher<arg_type>(\ + new gmock_Impl<arg_type>(p0, p1));\ + }\ + name##MatcherP2(p0##_type gmock_p0, p1##_type gmock_p1) : p0(gmock_p0), \ + p1(gmock_p1) {\ + }\ + p0##_type p0;\ + p1##_type p1;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##MatcherP2);\ + };\ + template <typename p0##_type, typename p1##_type>\ + inline name##MatcherP2<p0##_type, p1##_type> name(p0##_type p0, \ + p1##_type p1) {\ + return name##MatcherP2<p0##_type, p1##_type>(p0, p1);\ + }\ + template <typename p0##_type, typename p1##_type>\ + template <typename arg_type>\ + bool name##MatcherP2<p0##_type, \ + p1##_type>::gmock_Impl<arg_type>::MatchAndExplain(\ + arg_type arg, \ + ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_)\ + const + +#define MATCHER_P3(name, p0, p1, p2, description)\ + template <typename p0##_type, typename p1##_type, typename p2##_type>\ + class name##MatcherP3 {\ + public:\ + template <typename arg_type>\ + class gmock_Impl : public ::testing::MatcherInterface<arg_type> {\ + public:\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2)\ + : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2) {}\ + virtual bool MatchAndExplain(\ + arg_type arg, ::testing::MatchResultListener* result_listener) const;\ + virtual void DescribeTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(false);\ + }\ + virtual void DescribeNegationTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(true);\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + private:\ + ::testing::internal::string FormatDescription(bool negation) const {\ + const ::testing::internal::string gmock_description = (description);\ + if (!gmock_description.empty())\ + return gmock_description;\ + return ::testing::internal::FormatMatcherDescription(\ + negation, #name, \ + ::testing::internal::UniversalTersePrintTupleFieldsToStrings(\ + ::testing::tuple<p0##_type, p1##_type, p2##_type>(p0, p1, \ + p2)));\ + }\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename arg_type>\ + operator ::testing::Matcher<arg_type>() const {\ + return 
::testing::Matcher<arg_type>(\ + new gmock_Impl<arg_type>(p0, p1, p2));\ + }\ + name##MatcherP3(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2) {\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##MatcherP3);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type>\ + inline name##MatcherP3<p0##_type, p1##_type, p2##_type> name(p0##_type p0, \ + p1##_type p1, p2##_type p2) {\ + return name##MatcherP3<p0##_type, p1##_type, p2##_type>(p0, p1, p2);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type>\ + template <typename arg_type>\ + bool name##MatcherP3<p0##_type, p1##_type, \ + p2##_type>::gmock_Impl<arg_type>::MatchAndExplain(\ + arg_type arg, \ + ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_)\ + const + +#define MATCHER_P4(name, p0, p1, p2, p3, description)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type>\ + class name##MatcherP4 {\ + public:\ + template <typename arg_type>\ + class gmock_Impl : public ::testing::MatcherInterface<arg_type> {\ + public:\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3)\ + : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), p3(gmock_p3) {}\ + virtual bool MatchAndExplain(\ + arg_type arg, ::testing::MatchResultListener* result_listener) const;\ + virtual void DescribeTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(false);\ + }\ + virtual void DescribeNegationTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(true);\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + private:\ + ::testing::internal::string FormatDescription(bool negation) const {\ + const ::testing::internal::string gmock_description = (description);\ + if (!gmock_description.empty())\ + return gmock_description;\ + return ::testing::internal::FormatMatcherDescription(\ + negation, #name, \ + ::testing::internal::UniversalTersePrintTupleFieldsToStrings(\ + ::testing::tuple<p0##_type, p1##_type, p2##_type, \ + p3##_type>(p0, p1, p2, p3)));\ + }\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename arg_type>\ + operator ::testing::Matcher<arg_type>() const {\ + return ::testing::Matcher<arg_type>(\ + new gmock_Impl<arg_type>(p0, p1, p2, p3));\ + }\ + name##MatcherP4(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3) : p0(gmock_p0), p1(gmock_p1), \ + p2(gmock_p2), p3(gmock_p3) {\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##MatcherP4);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type>\ + inline name##MatcherP4<p0##_type, p1##_type, p2##_type, \ + p3##_type> name(p0##_type p0, p1##_type p1, p2##_type p2, \ + p3##_type p3) {\ + return name##MatcherP4<p0##_type, p1##_type, p2##_type, p3##_type>(p0, \ + p1, p2, p3);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type>\ + template <typename arg_type>\ + bool name##MatcherP4<p0##_type, p1##_type, p2##_type, \ + p3##_type>::gmock_Impl<arg_type>::MatchAndExplain(\ + arg_type arg, \ + ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_)\ + const + +#define MATCHER_P5(name, p0, p1, p2, p3, p4, description)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + 
typename p3##_type, typename p4##_type>\ + class name##MatcherP5 {\ + public:\ + template <typename arg_type>\ + class gmock_Impl : public ::testing::MatcherInterface<arg_type> {\ + public:\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4)\ + : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), p3(gmock_p3), \ + p4(gmock_p4) {}\ + virtual bool MatchAndExplain(\ + arg_type arg, ::testing::MatchResultListener* result_listener) const;\ + virtual void DescribeTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(false);\ + }\ + virtual void DescribeNegationTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(true);\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + private:\ + ::testing::internal::string FormatDescription(bool negation) const {\ + const ::testing::internal::string gmock_description = (description);\ + if (!gmock_description.empty())\ + return gmock_description;\ + return ::testing::internal::FormatMatcherDescription(\ + negation, #name, \ + ::testing::internal::UniversalTersePrintTupleFieldsToStrings(\ + ::testing::tuple<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type>(p0, p1, p2, p3, p4)));\ + }\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename arg_type>\ + operator ::testing::Matcher<arg_type>() const {\ + return ::testing::Matcher<arg_type>(\ + new gmock_Impl<arg_type>(p0, p1, p2, p3, p4));\ + }\ + name##MatcherP5(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3, \ + p4##_type gmock_p4) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4) {\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##MatcherP5);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type>\ + inline name##MatcherP5<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type> name(p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, \ + p4##_type p4) {\ + return name##MatcherP5<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type>(p0, p1, p2, p3, p4);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type>\ + template <typename arg_type>\ + bool name##MatcherP5<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type>::gmock_Impl<arg_type>::MatchAndExplain(\ + arg_type arg, \ + ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_)\ + const + +#define MATCHER_P6(name, p0, p1, p2, p3, p4, p5, description)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type>\ + class name##MatcherP6 {\ + public:\ + template <typename arg_type>\ + class gmock_Impl : public ::testing::MatcherInterface<arg_type> {\ + public:\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5)\ + : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), p3(gmock_p3), \ + p4(gmock_p4), p5(gmock_p5) {}\ + virtual bool MatchAndExplain(\ + arg_type arg, ::testing::MatchResultListener* result_listener) const;\ + virtual void DescribeTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(false);\ + }\ + virtual void DescribeNegationTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(true);\ + }\ + p0##_type p0;\ + 
p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + private:\ + ::testing::internal::string FormatDescription(bool negation) const {\ + const ::testing::internal::string gmock_description = (description);\ + if (!gmock_description.empty())\ + return gmock_description;\ + return ::testing::internal::FormatMatcherDescription(\ + negation, #name, \ + ::testing::internal::UniversalTersePrintTupleFieldsToStrings(\ + ::testing::tuple<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type>(p0, p1, p2, p3, p4, p5)));\ + }\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename arg_type>\ + operator ::testing::Matcher<arg_type>() const {\ + return ::testing::Matcher<arg_type>(\ + new gmock_Impl<arg_type>(p0, p1, p2, p3, p4, p5));\ + }\ + name##MatcherP6(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3, p4##_type gmock_p4, \ + p5##_type gmock_p5) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5) {\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##MatcherP6);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type>\ + inline name##MatcherP6<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type> name(p0##_type p0, p1##_type p1, p2##_type p2, \ + p3##_type p3, p4##_type p4, p5##_type p5) {\ + return name##MatcherP6<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type>(p0, p1, p2, p3, p4, p5);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type>\ + template <typename arg_type>\ + bool name##MatcherP6<p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \ + p5##_type>::gmock_Impl<arg_type>::MatchAndExplain(\ + arg_type arg, \ + ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_)\ + const + +#define MATCHER_P7(name, p0, p1, p2, p3, p4, p5, p6, description)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type>\ + class name##MatcherP7 {\ + public:\ + template <typename arg_type>\ + class gmock_Impl : public ::testing::MatcherInterface<arg_type> {\ + public:\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ + p6##_type gmock_p6)\ + : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), p3(gmock_p3), \ + p4(gmock_p4), p5(gmock_p5), p6(gmock_p6) {}\ + virtual bool MatchAndExplain(\ + arg_type arg, ::testing::MatchResultListener* result_listener) const;\ + virtual void DescribeTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(false);\ + }\ + virtual void DescribeNegationTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(true);\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + private:\ + ::testing::internal::string FormatDescription(bool negation) const {\ + const ::testing::internal::string gmock_description = (description);\ + if (!gmock_description.empty())\ + return gmock_description;\ + return ::testing::internal::FormatMatcherDescription(\ + negation, #name, \ + ::testing::internal::UniversalTersePrintTupleFieldsToStrings(\ + ::testing::tuple<p0##_type, 
p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type>(p0, p1, p2, p3, p4, p5, \ + p6)));\ + }\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename arg_type>\ + operator ::testing::Matcher<arg_type>() const {\ + return ::testing::Matcher<arg_type>(\ + new gmock_Impl<arg_type>(p0, p1, p2, p3, p4, p5, p6));\ + }\ + name##MatcherP7(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3, p4##_type gmock_p4, \ + p5##_type gmock_p5, p6##_type gmock_p6) : p0(gmock_p0), p1(gmock_p1), \ + p2(gmock_p2), p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), \ + p6(gmock_p6) {\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##MatcherP7);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type>\ + inline name##MatcherP7<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type> name(p0##_type p0, p1##_type p1, \ + p2##_type p2, p3##_type p3, p4##_type p4, p5##_type p5, \ + p6##_type p6) {\ + return name##MatcherP7<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type>(p0, p1, p2, p3, p4, p5, p6);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type>\ + template <typename arg_type>\ + bool name##MatcherP7<p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \ + p5##_type, p6##_type>::gmock_Impl<arg_type>::MatchAndExplain(\ + arg_type arg, \ + ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_)\ + const + +#define MATCHER_P8(name, p0, p1, p2, p3, p4, p5, p6, p7, description)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type>\ + class name##MatcherP8 {\ + public:\ + template <typename arg_type>\ + class gmock_Impl : public ::testing::MatcherInterface<arg_type> {\ + public:\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ + p6##_type gmock_p6, p7##_type gmock_p7)\ + : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), p3(gmock_p3), \ + p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), p7(gmock_p7) {}\ + virtual bool MatchAndExplain(\ + arg_type arg, ::testing::MatchResultListener* result_listener) const;\ + virtual void DescribeTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(false);\ + }\ + virtual void DescribeNegationTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(true);\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + p7##_type p7;\ + private:\ + ::testing::internal::string FormatDescription(bool negation) const {\ + const ::testing::internal::string gmock_description = (description);\ + if (!gmock_description.empty())\ + return gmock_description;\ + return ::testing::internal::FormatMatcherDescription(\ + negation, #name, \ + ::testing::internal::UniversalTersePrintTupleFieldsToStrings(\ + ::testing::tuple<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type>(p0, p1, p2, \ + p3, p4, p5, p6, p7)));\ + }\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename arg_type>\ + operator 
::testing::Matcher<arg_type>() const {\ + return ::testing::Matcher<arg_type>(\ + new gmock_Impl<arg_type>(p0, p1, p2, p3, p4, p5, p6, p7));\ + }\ + name##MatcherP8(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3, p4##_type gmock_p4, \ + p5##_type gmock_p5, p6##_type gmock_p6, \ + p7##_type gmock_p7) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), \ + p7(gmock_p7) {\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + p7##_type p7;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##MatcherP8);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type>\ + inline name##MatcherP8<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type> name(p0##_type p0, \ + p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, p5##_type p5, \ + p6##_type p6, p7##_type p7) {\ + return name##MatcherP8<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type>(p0, p1, p2, p3, p4, p5, \ + p6, p7);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type>\ + template <typename arg_type>\ + bool name##MatcherP8<p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \ + p5##_type, p6##_type, \ + p7##_type>::gmock_Impl<arg_type>::MatchAndExplain(\ + arg_type arg, \ + ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_)\ + const + +#define MATCHER_P9(name, p0, p1, p2, p3, p4, p5, p6, p7, p8, description)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type>\ + class name##MatcherP9 {\ + public:\ + template <typename arg_type>\ + class gmock_Impl : public ::testing::MatcherInterface<arg_type> {\ + public:\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ + p6##_type gmock_p6, p7##_type gmock_p7, p8##_type gmock_p8)\ + : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), p3(gmock_p3), \ + p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), p7(gmock_p7), \ + p8(gmock_p8) {}\ + virtual bool MatchAndExplain(\ + arg_type arg, ::testing::MatchResultListener* result_listener) const;\ + virtual void DescribeTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(false);\ + }\ + virtual void DescribeNegationTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(true);\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + p7##_type p7;\ + p8##_type p8;\ + private:\ + ::testing::internal::string FormatDescription(bool negation) const {\ + const ::testing::internal::string gmock_description = (description);\ + if (!gmock_description.empty())\ + return gmock_description;\ + return ::testing::internal::FormatMatcherDescription(\ + negation, #name, \ + ::testing::internal::UniversalTersePrintTupleFieldsToStrings(\ + ::testing::tuple<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type, \ + p8##_type>(p0, p1, p2, p3, p4, p5, p6, p7, p8)));\ + }\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + 
};\ + template <typename arg_type>\ + operator ::testing::Matcher<arg_type>() const {\ + return ::testing::Matcher<arg_type>(\ + new gmock_Impl<arg_type>(p0, p1, p2, p3, p4, p5, p6, p7, p8));\ + }\ + name##MatcherP9(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3, p4##_type gmock_p4, \ + p5##_type gmock_p5, p6##_type gmock_p6, p7##_type gmock_p7, \ + p8##_type gmock_p8) : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), \ + p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), p7(gmock_p7), \ + p8(gmock_p8) {\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + p7##_type p7;\ + p8##_type p8;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##MatcherP9);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type>\ + inline name##MatcherP9<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type, \ + p8##_type> name(p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, \ + p4##_type p4, p5##_type p5, p6##_type p6, p7##_type p7, \ + p8##_type p8) {\ + return name##MatcherP9<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type, p8##_type>(p0, p1, p2, \ + p3, p4, p5, p6, p7, p8);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type>\ + template <typename arg_type>\ + bool name##MatcherP9<p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, \ + p5##_type, p6##_type, p7##_type, \ + p8##_type>::gmock_Impl<arg_type>::MatchAndExplain(\ + arg_type arg, \ + ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_)\ + const + +#define MATCHER_P10(name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, description)\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type, \ + typename p9##_type>\ + class name##MatcherP10 {\ + public:\ + template <typename arg_type>\ + class gmock_Impl : public ::testing::MatcherInterface<arg_type> {\ + public:\ + gmock_Impl(p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \ + p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \ + p6##_type gmock_p6, p7##_type gmock_p7, p8##_type gmock_p8, \ + p9##_type gmock_p9)\ + : p0(gmock_p0), p1(gmock_p1), p2(gmock_p2), p3(gmock_p3), \ + p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), p7(gmock_p7), \ + p8(gmock_p8), p9(gmock_p9) {}\ + virtual bool MatchAndExplain(\ + arg_type arg, ::testing::MatchResultListener* result_listener) const;\ + virtual void DescribeTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(false);\ + }\ + virtual void DescribeNegationTo(::std::ostream* gmock_os) const {\ + *gmock_os << FormatDescription(true);\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + p7##_type p7;\ + p8##_type p8;\ + p9##_type p9;\ + private:\ + ::testing::internal::string FormatDescription(bool negation) const {\ + const ::testing::internal::string gmock_description = (description);\ + if (!gmock_description.empty())\ + return gmock_description;\ + return ::testing::internal::FormatMatcherDescription(\ + negation, 
#name, \ + ::testing::internal::UniversalTersePrintTupleFieldsToStrings(\ + ::testing::tuple<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type, p8##_type, \ + p9##_type>(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)));\ + }\ + GTEST_DISALLOW_ASSIGN_(gmock_Impl);\ + };\ + template <typename arg_type>\ + operator ::testing::Matcher<arg_type>() const {\ + return ::testing::Matcher<arg_type>(\ + new gmock_Impl<arg_type>(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9));\ + }\ + name##MatcherP10(p0##_type gmock_p0, p1##_type gmock_p1, \ + p2##_type gmock_p2, p3##_type gmock_p3, p4##_type gmock_p4, \ + p5##_type gmock_p5, p6##_type gmock_p6, p7##_type gmock_p7, \ + p8##_type gmock_p8, p9##_type gmock_p9) : p0(gmock_p0), p1(gmock_p1), \ + p2(gmock_p2), p3(gmock_p3), p4(gmock_p4), p5(gmock_p5), p6(gmock_p6), \ + p7(gmock_p7), p8(gmock_p8), p9(gmock_p9) {\ + }\ + p0##_type p0;\ + p1##_type p1;\ + p2##_type p2;\ + p3##_type p3;\ + p4##_type p4;\ + p5##_type p5;\ + p6##_type p6;\ + p7##_type p7;\ + p8##_type p8;\ + p9##_type p9;\ + private:\ + GTEST_DISALLOW_ASSIGN_(name##MatcherP10);\ + };\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type, \ + typename p9##_type>\ + inline name##MatcherP10<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type, p8##_type, \ + p9##_type> name(p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, \ + p4##_type p4, p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8, \ + p9##_type p9) {\ + return name##MatcherP10<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type, p8##_type, p9##_type>(p0, \ + p1, p2, p3, p4, p5, p6, p7, p8, p9);\ + }\ + template <typename p0##_type, typename p1##_type, typename p2##_type, \ + typename p3##_type, typename p4##_type, typename p5##_type, \ + typename p6##_type, typename p7##_type, typename p8##_type, \ + typename p9##_type>\ + template <typename arg_type>\ + bool name##MatcherP10<p0##_type, p1##_type, p2##_type, p3##_type, \ + p4##_type, p5##_type, p6##_type, p7##_type, p8##_type, \ + p9##_type>::gmock_Impl<arg_type>::MatchAndExplain(\ + arg_type arg, \ + ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_)\ + const + +#endif // GMOCK_INCLUDE_GMOCK_GMOCK_GENERATED_MATCHERS_H_ diff --git a/utils/unittest/googlemock/include/gmock/gmock-generated-nice-strict.h b/utils/unittest/googlemock/include/gmock/gmock-generated-nice-strict.h new file mode 100644 index 000000000000..4095f4d5bc7f --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/gmock-generated-nice-strict.h @@ -0,0 +1,397 @@ +// This file was GENERATED by command: +// pump.py gmock-generated-nice-strict.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Implements class templates NiceMock, NaggyMock, and StrictMock.
+//
+// Given a mock class MockFoo that is created using Google Mock,
+// NiceMock<MockFoo> is a subclass of MockFoo that allows
+// uninteresting calls (i.e. calls to mock methods that have no
+// EXPECT_CALL specs), NaggyMock<MockFoo> is a subclass of MockFoo
+// that prints a warning when an uninteresting call occurs, and
+// StrictMock<MockFoo> is a subclass of MockFoo that treats all
+// uninteresting calls as errors.
+//
+// Currently a mock is naggy by default, so MockFoo and
+// NaggyMock<MockFoo> behave the same.  However, we will soon
+// switch the default behavior of mocks to be nice, as that in general
+// leads to more maintainable tests.  When that happens, MockFoo will
+// stop behaving like NaggyMock<MockFoo> and start behaving like
+// NiceMock<MockFoo>.
+//
+// NiceMock, NaggyMock, and StrictMock "inherit" the constructors of
+// their respective base class, with up to 10 arguments.  Therefore
+// you can write NiceMock<MockFoo>(5, "a") to construct a nice mock
+// where MockFoo has a constructor that accepts (int, const char*),
+// for example.
+//
+// A known limitation is that NiceMock<MockFoo>, NaggyMock<MockFoo>,
+// and StrictMock<MockFoo> only work for mock methods defined using
+// the MOCK_METHOD* family of macros DIRECTLY in the MockFoo class.
+// If a mock method is defined in a base class of MockFoo, the "nice"
+// or "strict" modifier may not affect it, depending on the compiler.
+// In particular, nesting NiceMock, NaggyMock, and StrictMock is NOT
+// supported.
+//
+// Another known limitation is that the constructors of the base mock
+// cannot have arguments passed by non-const reference, which are
+// banned by the Google C++ style guide anyway.
+
+#ifndef GMOCK_INCLUDE_GMOCK_GMOCK_GENERATED_NICE_STRICT_H_
+#define GMOCK_INCLUDE_GMOCK_GMOCK_GENERATED_NICE_STRICT_H_
+
+#include "gmock/gmock-spec-builders.h"
+#include "gmock/internal/gmock-port.h"
+
+namespace testing {
+
+template <class MockClass>
+class NiceMock : public MockClass {
+ public:
+  // We don't factor out the constructor body to a common method, as
+  // we have to avoid a possible clash with members of MockClass.
+  NiceMock() {
+    ::testing::Mock::AllowUninterestingCalls(
+        internal::ImplicitCast_<MockClass*>(this));
+  }
+
+  // C++ doesn't (yet) allow inheritance of constructors, so we have
+  // to define it for each arity.
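To make the forwarding concrete, a hedged sketch (MockFoo is a hypothetical mock class whose constructor takes (int, const char*)):

  NiceMock<MockFoo> nice_foo(5, "a");  // runs MockFoo(5, "a"), then allows
                                       // uninteresting calls on this mock

Each overload below does the same two things for its arity: pass the arguments through to MockClass, then register the allow-uninteresting-calls reaction, exactly as the default constructor above does.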
+ template <typename A1> + explicit NiceMock(const A1& a1) : MockClass(a1) { + ::testing::Mock::AllowUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + template <typename A1, typename A2> + NiceMock(const A1& a1, const A2& a2) : MockClass(a1, a2) { + ::testing::Mock::AllowUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3> + NiceMock(const A1& a1, const A2& a2, const A3& a3) : MockClass(a1, a2, a3) { + ::testing::Mock::AllowUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4> + NiceMock(const A1& a1, const A2& a2, const A3& a3, + const A4& a4) : MockClass(a1, a2, a3, a4) { + ::testing::Mock::AllowUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5> + NiceMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5) : MockClass(a1, a2, a3, a4, a5) { + ::testing::Mock::AllowUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6> + NiceMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6) : MockClass(a1, a2, a3, a4, a5, a6) { + ::testing::Mock::AllowUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7> + NiceMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6, const A7& a7) : MockClass(a1, a2, a3, a4, a5, + a6, a7) { + ::testing::Mock::AllowUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7, typename A8> + NiceMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6, const A7& a7, const A8& a8) : MockClass(a1, + a2, a3, a4, a5, a6, a7, a8) { + ::testing::Mock::AllowUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7, typename A8, typename A9> + NiceMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6, const A7& a7, const A8& a8, + const A9& a9) : MockClass(a1, a2, a3, a4, a5, a6, a7, a8, a9) { + ::testing::Mock::AllowUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7, typename A8, typename A9, typename A10> + NiceMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6, const A7& a7, const A8& a8, const A9& a9, + const A10& a10) : MockClass(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) { + ::testing::Mock::AllowUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + virtual ~NiceMock() { + ::testing::Mock::UnregisterCallReaction( + internal::ImplicitCast_<MockClass*>(this)); + } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(NiceMock); +}; + +template <class MockClass> +class NaggyMock : public MockClass { + public: + // We don't factor out the constructor body to a common method, as + // we have to avoid a possible clash with members of MockClass. 
+ NaggyMock() { + ::testing::Mock::WarnUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + // C++ doesn't (yet) allow inheritance of constructors, so we have + // to define it for each arity. + template <typename A1> + explicit NaggyMock(const A1& a1) : MockClass(a1) { + ::testing::Mock::WarnUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + template <typename A1, typename A2> + NaggyMock(const A1& a1, const A2& a2) : MockClass(a1, a2) { + ::testing::Mock::WarnUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3> + NaggyMock(const A1& a1, const A2& a2, const A3& a3) : MockClass(a1, a2, a3) { + ::testing::Mock::WarnUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4> + NaggyMock(const A1& a1, const A2& a2, const A3& a3, + const A4& a4) : MockClass(a1, a2, a3, a4) { + ::testing::Mock::WarnUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5> + NaggyMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5) : MockClass(a1, a2, a3, a4, a5) { + ::testing::Mock::WarnUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6> + NaggyMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6) : MockClass(a1, a2, a3, a4, a5, a6) { + ::testing::Mock::WarnUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7> + NaggyMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6, const A7& a7) : MockClass(a1, a2, a3, a4, a5, + a6, a7) { + ::testing::Mock::WarnUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7, typename A8> + NaggyMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6, const A7& a7, const A8& a8) : MockClass(a1, + a2, a3, a4, a5, a6, a7, a8) { + ::testing::Mock::WarnUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7, typename A8, typename A9> + NaggyMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6, const A7& a7, const A8& a8, + const A9& a9) : MockClass(a1, a2, a3, a4, a5, a6, a7, a8, a9) { + ::testing::Mock::WarnUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7, typename A8, typename A9, typename A10> + NaggyMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6, const A7& a7, const A8& a8, const A9& a9, + const A10& a10) : MockClass(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) { + ::testing::Mock::WarnUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + virtual ~NaggyMock() { + ::testing::Mock::UnregisterCallReaction( + internal::ImplicitCast_<MockClass*>(this)); + } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(NaggyMock); +}; + +template <class MockClass> +class StrictMock : public MockClass { + public: + 
// We don't factor out the constructor body to a common method, as + // we have to avoid a possible clash with members of MockClass. + StrictMock() { + ::testing::Mock::FailUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + // C++ doesn't (yet) allow inheritance of constructors, so we have + // to define it for each arity. + template <typename A1> + explicit StrictMock(const A1& a1) : MockClass(a1) { + ::testing::Mock::FailUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + template <typename A1, typename A2> + StrictMock(const A1& a1, const A2& a2) : MockClass(a1, a2) { + ::testing::Mock::FailUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3> + StrictMock(const A1& a1, const A2& a2, const A3& a3) : MockClass(a1, a2, a3) { + ::testing::Mock::FailUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4> + StrictMock(const A1& a1, const A2& a2, const A3& a3, + const A4& a4) : MockClass(a1, a2, a3, a4) { + ::testing::Mock::FailUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5> + StrictMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5) : MockClass(a1, a2, a3, a4, a5) { + ::testing::Mock::FailUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6> + StrictMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6) : MockClass(a1, a2, a3, a4, a5, a6) { + ::testing::Mock::FailUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7> + StrictMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6, const A7& a7) : MockClass(a1, a2, a3, a4, a5, + a6, a7) { + ::testing::Mock::FailUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7, typename A8> + StrictMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6, const A7& a7, const A8& a8) : MockClass(a1, + a2, a3, a4, a5, a6, a7, a8) { + ::testing::Mock::FailUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7, typename A8, typename A9> + StrictMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6, const A7& a7, const A8& a8, + const A9& a9) : MockClass(a1, a2, a3, a4, a5, a6, a7, a8, a9) { + ::testing::Mock::FailUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7, typename A8, typename A9, typename A10> + StrictMock(const A1& a1, const A2& a2, const A3& a3, const A4& a4, + const A5& a5, const A6& a6, const A7& a7, const A8& a8, const A9& a9, + const A10& a10) : MockClass(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) { + ::testing::Mock::FailUninterestingCalls( + internal::ImplicitCast_<MockClass*>(this)); + } + + virtual ~StrictMock() { + ::testing::Mock::UnregisterCallReaction( + internal::ImplicitCast_<MockClass*>(this)); + } 
+ + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(StrictMock); +}; + +// The following specializations catch some (relatively more common) +// user errors of nesting nice and strict mocks. They do NOT catch +// all possible errors. + +// These specializations are declared but not defined, as NiceMock, +// NaggyMock, and StrictMock cannot be nested. + +template <typename MockClass> +class NiceMock<NiceMock<MockClass> >; +template <typename MockClass> +class NiceMock<NaggyMock<MockClass> >; +template <typename MockClass> +class NiceMock<StrictMock<MockClass> >; + +template <typename MockClass> +class NaggyMock<NiceMock<MockClass> >; +template <typename MockClass> +class NaggyMock<NaggyMock<MockClass> >; +template <typename MockClass> +class NaggyMock<StrictMock<MockClass> >; + +template <typename MockClass> +class StrictMock<NiceMock<MockClass> >; +template <typename MockClass> +class StrictMock<NaggyMock<MockClass> >; +template <typename MockClass> +class StrictMock<StrictMock<MockClass> >; + +} // namespace testing + +#endif // GMOCK_INCLUDE_GMOCK_GMOCK_GENERATED_NICE_STRICT_H_ diff --git a/utils/unittest/googlemock/include/gmock/gmock-matchers.h b/utils/unittest/googlemock/include/gmock/gmock-matchers.h new file mode 100644 index 000000000000..749a30e4e6d8 --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/gmock-matchers.h @@ -0,0 +1,4415 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file implements some commonly used argument matchers. More +// matchers can be defined by the user implementing the +// MatcherInterface<T> interface if necessary. 
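The header comment just above points users at MatcherInterface<T> for custom matchers; a minimal sketch of that route follows (DivisibleBy7 is an illustrative name, while MatcherInterface, MatchResultListener, MakeMatcher, and Matcher are the gmock types declared in this header):

#include <ostream>
#include "gmock/gmock.h"

// Matches any int that is a multiple of 7, explaining near-misses.
class DivisibleBy7Matcher : public ::testing::MatcherInterface<int> {
 public:
  virtual bool MatchAndExplain(
      int n, ::testing::MatchResultListener* listener) const {
    if (n % 7 != 0 && listener->IsInterested())
      *listener << "the remainder is " << (n % 7);
    return (n % 7) == 0;
  }
  virtual void DescribeTo(::std::ostream* os) const {
    *os << "is divisible by 7";
  }
  virtual void DescribeNegationTo(::std::ostream* os) const {
    *os << "is not divisible by 7";
  }
};

inline ::testing::Matcher<int> DivisibleBy7() {
  return ::testing::MakeMatcher(new DivisibleBy7Matcher);
}

// Usage: EXPECT_THAT(some_int, DivisibleBy7());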
+ +#ifndef GMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_ +#define GMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_ + +#include <math.h> +#include <algorithm> +#include <iterator> +#include <limits> +#include <ostream> // NOLINT +#include <sstream> +#include <string> +#include <utility> +#include <vector> + +#include "gmock/internal/gmock-internal-utils.h" +#include "gmock/internal/gmock-port.h" +#include "gtest/gtest.h" + +#if GTEST_HAS_STD_INITIALIZER_LIST_ +# include <initializer_list> // NOLINT -- must be after gtest.h +#endif + +namespace testing { + +// To implement a matcher Foo for type T, define: +// 1. a class FooMatcherImpl that implements the +// MatcherInterface<T> interface, and +// 2. a factory function that creates a Matcher<T> object from a +// FooMatcherImpl*. +// +// The two-level delegation design makes it possible to allow a user +// to write "v" instead of "Eq(v)" where a Matcher is expected, which +// is impossible if we pass matchers by pointers. It also eases +// ownership management as Matcher objects can now be copied like +// plain values. + +// MatchResultListener is an abstract class. Its << operator can be +// used by a matcher to explain why a value matches or doesn't match. +// +// TODO(wan@google.com): add method +// bool InterestedInWhy(bool result) const; +// to indicate whether the listener is interested in why the match +// result is 'result'. +class MatchResultListener { + public: + // Creates a listener object with the given underlying ostream. The + // listener does not own the ostream, and does not dereference it + // in the constructor or destructor. + explicit MatchResultListener(::std::ostream* os) : stream_(os) {} + virtual ~MatchResultListener() = 0; // Makes this class abstract. + + // Streams x to the underlying ostream; does nothing if the ostream + // is NULL. + template <typename T> + MatchResultListener& operator<<(const T& x) { + if (stream_ != NULL) + *stream_ << x; + return *this; + } + + // Returns the underlying ostream. + ::std::ostream* stream() { return stream_; } + + // Returns true iff the listener is interested in an explanation of + // the match result. A matcher's MatchAndExplain() method can use + // this information to avoid generating the explanation when no one + // intends to hear it. + bool IsInterested() const { return stream_ != NULL; } + + private: + ::std::ostream* const stream_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener); +}; + +inline MatchResultListener::~MatchResultListener() { +} + +// An instance of a subclass of this knows how to describe itself as a +// matcher. +class MatcherDescriberInterface { + public: + virtual ~MatcherDescriberInterface() {} + + // Describes this matcher to an ostream. The function should print + // a verb phrase that describes the property a value matching this + // matcher should have. The subject of the verb phrase is the value + // being matched. For example, the DescribeTo() method of the Gt(7) + // matcher prints "is greater than 7". + virtual void DescribeTo(::std::ostream* os) const = 0; + + // Describes the negation of this matcher to an ostream. For + // example, if the description of this matcher is "is greater than + // 7", the negated description could be "is not greater than 7". + // You are not required to override this when implementing + // MatcherInterface, but it is highly advised so that your matcher + // can produce good error messages. 
+ virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "not ("; + DescribeTo(os); + *os << ")"; + } +}; + +// The implementation of a matcher. +template <typename T> +class MatcherInterface : public MatcherDescriberInterface { + public: + // Returns true iff the matcher matches x; also explains the match + // result to 'listener' if necessary (see the next paragraph), in + // the form of a non-restrictive relative clause ("which ...", + // "whose ...", etc) that describes x. For example, the + // MatchAndExplain() method of the Pointee(...) matcher should + // generate an explanation like "which points to ...". + // + // Implementations of MatchAndExplain() should add an explanation of + // the match result *if and only if* they can provide additional + // information that's not already present (or not obvious) in the + // print-out of x and the matcher's description. Whether the match + // succeeds is not a factor in deciding whether an explanation is + // needed, as sometimes the caller needs to print a failure message + // when the match succeeds (e.g. when the matcher is used inside + // Not()). + // + // For example, a "has at least 10 elements" matcher should explain + // what the actual element count is, regardless of the match result, + // as it is useful information to the reader; on the other hand, an + // "is empty" matcher probably only needs to explain what the actual + // size is when the match fails, as it's redundant to say that the + // size is 0 when the value is already known to be empty. + // + // You should override this method when defining a new matcher. + // + // It's the responsibility of the caller (Google Mock) to guarantee + // that 'listener' is not NULL. This helps to simplify a matcher's + // implementation when it doesn't care about the performance, as it + // can talk to 'listener' without checking its validity first. + // However, in order to implement dummy listeners efficiently, + // listener->stream() may be NULL. + virtual bool MatchAndExplain(T x, MatchResultListener* listener) const = 0; + + // Inherits these methods from MatcherDescriberInterface: + // virtual void DescribeTo(::std::ostream* os) const = 0; + // virtual void DescribeNegationTo(::std::ostream* os) const; +}; + +// A match result listener that stores the explanation in a string. +class StringMatchResultListener : public MatchResultListener { + public: + StringMatchResultListener() : MatchResultListener(&ss_) {} + + // Returns the explanation accumulated so far. + internal::string str() const { return ss_.str(); } + + // Clears the explanation accumulated so far. 
+ void Clear() { ss_.str(""); } + + private: + ::std::stringstream ss_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(StringMatchResultListener); +}; + +namespace internal { + +struct AnyEq { + template <typename A, typename B> + bool operator()(const A& a, const B& b) const { return a == b; } +}; +struct AnyNe { + template <typename A, typename B> + bool operator()(const A& a, const B& b) const { return a != b; } +}; +struct AnyLt { + template <typename A, typename B> + bool operator()(const A& a, const B& b) const { return a < b; } +}; +struct AnyGt { + template <typename A, typename B> + bool operator()(const A& a, const B& b) const { return a > b; } +}; +struct AnyLe { + template <typename A, typename B> + bool operator()(const A& a, const B& b) const { return a <= b; } +}; +struct AnyGe { + template <typename A, typename B> + bool operator()(const A& a, const B& b) const { return a >= b; } +}; + +// A match result listener that ignores the explanation. +class DummyMatchResultListener : public MatchResultListener { + public: + DummyMatchResultListener() : MatchResultListener(NULL) {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener); +}; + +// A match result listener that forwards the explanation to a given +// ostream. The difference between this and MatchResultListener is +// that the former is concrete. +class StreamMatchResultListener : public MatchResultListener { + public: + explicit StreamMatchResultListener(::std::ostream* os) + : MatchResultListener(os) {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener); +}; + +// An internal class for implementing Matcher<T>, which will derive +// from it. We put functionalities common to all Matcher<T> +// specializations here to avoid code duplication. +template <typename T> +class MatcherBase { + public: + // Returns true iff the matcher matches x; also explains the match + // result to 'listener'. + bool MatchAndExplain(T x, MatchResultListener* listener) const { + return impl_->MatchAndExplain(x, listener); + } + + // Returns true iff this matcher matches x. + bool Matches(T x) const { + DummyMatchResultListener dummy; + return MatchAndExplain(x, &dummy); + } + + // Describes this matcher to an ostream. + void DescribeTo(::std::ostream* os) const { impl_->DescribeTo(os); } + + // Describes the negation of this matcher to an ostream. + void DescribeNegationTo(::std::ostream* os) const { + impl_->DescribeNegationTo(os); + } + + // Explains why x matches, or doesn't match, the matcher. + void ExplainMatchResultTo(T x, ::std::ostream* os) const { + StreamMatchResultListener listener(os); + MatchAndExplain(x, &listener); + } + + // Returns the describer for this matcher object; retains ownership + // of the describer, which is only guaranteed to be alive when + // this matcher object is alive. + const MatcherDescriberInterface* GetDescriber() const { + return impl_.get(); + } + + protected: + MatcherBase() {} + + // Constructs a matcher from its implementation. + explicit MatcherBase(const MatcherInterface<T>* impl) + : impl_(impl) {} + + virtual ~MatcherBase() {} + + private: + // shared_ptr (util/gtl/shared_ptr.h) and linked_ptr have similar + // interfaces. The former dynamically allocates a chunk of memory + // to hold the reference count, while the latter tracks all + // references using a circular linked list without allocating + // memory. It has been observed that linked_ptr performs better in + // typical scenarios. 
However, shared_ptr can out-perform + // linked_ptr when there are many more uses of the copy constructor + // than the default constructor. + // + // If performance becomes a problem, we should see if using + // shared_ptr helps. + ::testing::internal::linked_ptr<const MatcherInterface<T> > impl_; +}; + +} // namespace internal + +// A Matcher<T> is a copyable and IMMUTABLE (except by assignment) +// object that can check whether a value of type T matches. The +// implementation of Matcher<T> is just a linked_ptr to const +// MatcherInterface<T>, so copying is fairly cheap. Don't inherit +// from Matcher! +template <typename T> +class Matcher : public internal::MatcherBase<T> { + public: + // Constructs a null matcher. Needed for storing Matcher objects in STL + // containers. A default-constructed matcher is not yet initialized. You + // cannot use it until a valid value has been assigned to it. + explicit Matcher() {} // NOLINT + + // Constructs a matcher from its implementation. + explicit Matcher(const MatcherInterface<T>* impl) + : internal::MatcherBase<T>(impl) {} + + // Implicit constructor here allows people to write + // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes + Matcher(T value); // NOLINT +}; + +// The following two specializations allow the user to write str +// instead of Eq(str) and "foo" instead of Eq("foo") when a string +// matcher is expected. +template <> +class GTEST_API_ Matcher<const internal::string&> + : public internal::MatcherBase<const internal::string&> { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface<const internal::string&>* impl) + : internal::MatcherBase<const internal::string&>(impl) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a string object. + Matcher(const internal::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT +}; + +template <> +class GTEST_API_ Matcher<internal::string> + : public internal::MatcherBase<internal::string> { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface<internal::string>* impl) + : internal::MatcherBase<internal::string>(impl) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a string object. + Matcher(const internal::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT +}; + +#if GTEST_HAS_STRING_PIECE_ +// The following two specializations allow the user to write str +// instead of Eq(str) and "foo" instead of Eq("foo") when a StringPiece +// matcher is expected. +template <> +class GTEST_API_ Matcher<const StringPiece&> + : public internal::MatcherBase<const StringPiece&> { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface<const StringPiece&>* impl) + : internal::MatcherBase<const StringPiece&>(impl) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a string object. + Matcher(const internal::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT + + // Allows the user to pass StringPieces directly. 
+ Matcher(StringPiece s); // NOLINT +}; + +template <> +class GTEST_API_ Matcher<StringPiece> + : public internal::MatcherBase<StringPiece> { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface<StringPiece>* impl) + : internal::MatcherBase<StringPiece>(impl) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a string object. + Matcher(const internal::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT + + // Allows the user to pass StringPieces directly. + Matcher(StringPiece s); // NOLINT +}; +#endif // GTEST_HAS_STRING_PIECE_ + +// The PolymorphicMatcher class template makes it easy to implement a +// polymorphic matcher (i.e. a matcher that can match values of more +// than one type, e.g. Eq(n) and NotNull()). +// +// To define a polymorphic matcher, a user should provide an Impl +// class that has a DescribeTo() method and a DescribeNegationTo() +// method, and define a member function (or member function template) +// +// bool MatchAndExplain(const Value& value, +// MatchResultListener* listener) const; +// +// See the definition of NotNull() for a complete example. +template <class Impl> +class PolymorphicMatcher { + public: + explicit PolymorphicMatcher(const Impl& an_impl) : impl_(an_impl) {} + + // Returns a mutable reference to the underlying matcher + // implementation object. + Impl& mutable_impl() { return impl_; } + + // Returns an immutable reference to the underlying matcher + // implementation object. + const Impl& impl() const { return impl_; } + + template <typename T> + operator Matcher<T>() const { + return Matcher<T>(new MonomorphicImpl<T>(impl_)); + } + + private: + template <typename T> + class MonomorphicImpl : public MatcherInterface<T> { + public: + explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {} + + virtual void DescribeTo(::std::ostream* os) const { + impl_.DescribeTo(os); + } + + virtual void DescribeNegationTo(::std::ostream* os) const { + impl_.DescribeNegationTo(os); + } + + virtual bool MatchAndExplain(T x, MatchResultListener* listener) const { + return impl_.MatchAndExplain(x, listener); + } + + private: + const Impl impl_; + + GTEST_DISALLOW_ASSIGN_(MonomorphicImpl); + }; + + Impl impl_; + + GTEST_DISALLOW_ASSIGN_(PolymorphicMatcher); +}; + +// Creates a matcher from its implementation. This is easier to use +// than the Matcher<T> constructor as it doesn't require you to +// explicitly write the template argument, e.g. +// +// MakeMatcher(foo); +// vs +// Matcher<const string&>(foo); +template <typename T> +inline Matcher<T> MakeMatcher(const MatcherInterface<T>* impl) { + return Matcher<T>(impl); +} + +// Creates a polymorphic matcher from its implementation. This is +// easier to use than the PolymorphicMatcher<Impl> constructor as it +// doesn't require you to explicitly write the template argument, e.g. +// +// MakePolymorphicMatcher(foo); +// vs +// PolymorphicMatcher<TypeOfFoo>(foo); +template <class Impl> +inline PolymorphicMatcher<Impl> MakePolymorphicMatcher(const Impl& impl) { + return PolymorphicMatcher<Impl>(impl); +} + +// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION +// and MUST NOT BE USED IN USER CODE!!! +namespace internal { + +// The MatcherCastImpl class template is a helper for implementing +// MatcherCast(). 
We need this helper in order to partially
+// specialize the implementation of MatcherCast() (C++ allows
+// class/struct templates to be partially specialized, but not
+// function templates).
+
+// This general version is used when MatcherCast()'s argument is a
+// polymorphic matcher (i.e. something that can be converted to a
+// Matcher but is not one yet; for example, Eq(value)) or a value (for
+// example, "hello").
+template <typename T, typename M>
+class MatcherCastImpl {
+ public:
+  static Matcher<T> Cast(const M& polymorphic_matcher_or_value) {
+    // M can be a polymorphic matcher, in which case we want to use
+    // its conversion operator to create Matcher<T>.  Or it can be a value
+    // that should be passed to the Matcher<T>'s constructor.
+    //
+    // We can't call Matcher<T>(polymorphic_matcher_or_value) when M is a
+    // polymorphic matcher because it'll be ambiguous if T has an implicit
+    // constructor from M (this usually happens when T has an implicit
+    // constructor from any type).
+    //
+    // It won't work to unconditionally implicit_cast
+    // polymorphic_matcher_or_value to Matcher<T> because it won't trigger
+    // a user-defined conversion from M to T if one exists (assuming M is
+    // a value).
+    return CastImpl(
+        polymorphic_matcher_or_value,
+        BooleanConstant<
+            internal::ImplicitlyConvertible<M, Matcher<T> >::value>());
+  }
+
+ private:
+  static Matcher<T> CastImpl(const M& value, BooleanConstant<false>) {
+    // M can't be implicitly converted to Matcher<T>, so M isn't a polymorphic
+    // matcher.  It must be a value then.  Use direct initialization to create
+    // a matcher.
+    return Matcher<T>(ImplicitCast_<T>(value));
+  }
+
+  static Matcher<T> CastImpl(const M& polymorphic_matcher_or_value,
+                             BooleanConstant<true>) {
+    // M is implicitly convertible to Matcher<T>, which means that either
+    // M is a polymorphic matcher or Matcher<T> has an implicit constructor
+    // from M.  In both cases using the implicit conversion will produce a
+    // matcher.
+    //
+    // Even if T has an implicit constructor from M, it won't be called because
+    // creating Matcher<T> would require a chain of two user-defined conversions
+    // (first to create T from M and then to create Matcher<T> from T).
+    return polymorphic_matcher_or_value;
+  }
+};
+
+// This more specialized version is used when MatcherCast()'s argument
+// is already a Matcher.  This only compiles when type T can be
+// statically converted to type U.
+template <typename T, typename U>
+class MatcherCastImpl<T, Matcher<U> > {
+ public:
+  static Matcher<T> Cast(const Matcher<U>& source_matcher) {
+    return Matcher<T>(new Impl(source_matcher));
+  }
+
+ private:
+  class Impl : public MatcherInterface<T> {
+   public:
+    explicit Impl(const Matcher<U>& source_matcher)
+        : source_matcher_(source_matcher) {}
+
+    // We delegate the matching logic to the source matcher.
+    virtual bool MatchAndExplain(T x, MatchResultListener* listener) const {
+      return source_matcher_.MatchAndExplain(static_cast<U>(x), listener);
+    }
+
+    virtual void DescribeTo(::std::ostream* os) const {
+      source_matcher_.DescribeTo(os);
+    }
+
+    virtual void DescribeNegationTo(::std::ostream* os) const {
+      source_matcher_.DescribeNegationTo(os);
+    }
+
+   private:
+    const Matcher<U> source_matcher_;
+
+    GTEST_DISALLOW_ASSIGN_(Impl);
+  };
+};
+
+// This even more specialized version is used for efficiently casting
+// a matcher to its own type.
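+// For example (illustrative, not in the original source), the cast
+// below simply copies the matcher, sharing its implementation, rather
+// than allocating a new adapter:
+//
+//   Matcher<int> m = Lt(5);
+//   Matcher<int> same = MatcherCast<int>(m);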
+template <typename T> +class MatcherCastImpl<T, Matcher<T> > { + public: + static Matcher<T> Cast(const Matcher<T>& matcher) { return matcher; } +}; + +} // namespace internal + +// In order to be safe and clear, casting between different matcher +// types is done explicitly via MatcherCast<T>(m), which takes a +// matcher m and returns a Matcher<T>. It compiles only when T can be +// statically converted to the argument type of m. +template <typename T, typename M> +inline Matcher<T> MatcherCast(const M& matcher) { + return internal::MatcherCastImpl<T, M>::Cast(matcher); +} + +// Implements SafeMatcherCast(). +// +// We use an intermediate class to do the actual safe casting as Nokia's +// Symbian compiler cannot decide between +// template <T, M> ... (M) and +// template <T, U> ... (const Matcher<U>&) +// for function templates but can for member function templates. +template <typename T> +class SafeMatcherCastImpl { + public: + // This overload handles polymorphic matchers and values only since + // monomorphic matchers are handled by the next one. + template <typename M> + static inline Matcher<T> Cast(const M& polymorphic_matcher_or_value) { + return internal::MatcherCastImpl<T, M>::Cast(polymorphic_matcher_or_value); + } + + // This overload handles monomorphic matchers. + // + // In general, if type T can be implicitly converted to type U, we can + // safely convert a Matcher<U> to a Matcher<T> (i.e. Matcher is + // contravariant): just keep a copy of the original Matcher<U>, convert the + // argument from type T to U, and then pass it to the underlying Matcher<U>. + // The only exception is when U is a reference and T is not, as the + // underlying Matcher<U> may be interested in the argument's address, which + // is not preserved in the conversion from T to U. + template <typename U> + static inline Matcher<T> Cast(const Matcher<U>& matcher) { + // Enforce that T can be implicitly converted to U. + GTEST_COMPILE_ASSERT_((internal::ImplicitlyConvertible<T, U>::value), + T_must_be_implicitly_convertible_to_U); + // Enforce that we are not converting a non-reference type T to a reference + // type U. + GTEST_COMPILE_ASSERT_( + internal::is_reference<T>::value || !internal::is_reference<U>::value, + cannot_convert_non_referentce_arg_to_reference); + // In case both T and U are arithmetic types, enforce that the + // conversion is not lossy. + typedef GTEST_REMOVE_REFERENCE_AND_CONST_(T) RawT; + typedef GTEST_REMOVE_REFERENCE_AND_CONST_(U) RawU; + const bool kTIsOther = GMOCK_KIND_OF_(RawT) == internal::kOther; + const bool kUIsOther = GMOCK_KIND_OF_(RawU) == internal::kOther; + GTEST_COMPILE_ASSERT_( + kTIsOther || kUIsOther || + (internal::LosslessArithmeticConvertible<RawT, RawU>::value), + conversion_of_arithmetic_types_must_be_lossless); + return MatcherCast<T>(matcher); + } +}; + +template <typename T, typename M> +inline Matcher<T> SafeMatcherCast(const M& polymorphic_matcher) { + return SafeMatcherCastImpl<T>::Cast(polymorphic_matcher); +} + +// A<T>() returns a matcher that matches any value of type T. +template <typename T> +Matcher<T> A(); + +// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION +// and MUST NOT BE USED IN USER CODE!!! +namespace internal { + +// If the explanation is not empty, prints it to the ostream. +inline void PrintIfNotEmpty(const internal::string& explanation, + ::std::ostream* os) { + if (explanation != "" && os != NULL) { + *os << ", " << explanation; + } +} + +// Returns true if the given type name is easy to read by a human. 
+// This is used to decide whether printing the type of a value might +// be helpful. +inline bool IsReadableTypeName(const string& type_name) { + // We consider a type name readable if it's short or doesn't contain + // a template or function type. + return (type_name.length() <= 20 || + type_name.find_first_of("<(") == string::npos); +} + +// Matches the value against the given matcher, prints the value and explains +// the match result to the listener. Returns the match result. +// 'listener' must not be NULL. +// Value cannot be passed by const reference, because some matchers take a +// non-const argument. +template <typename Value, typename T> +bool MatchPrintAndExplain(Value& value, const Matcher<T>& matcher, + MatchResultListener* listener) { + if (!listener->IsInterested()) { + // If the listener is not interested, we do not need to construct the + // inner explanation. + return matcher.Matches(value); + } + + StringMatchResultListener inner_listener; + const bool match = matcher.MatchAndExplain(value, &inner_listener); + + UniversalPrint(value, listener->stream()); +#if GTEST_HAS_RTTI + const string& type_name = GetTypeName<Value>(); + if (IsReadableTypeName(type_name)) + *listener->stream() << " (of type " << type_name << ")"; +#endif + PrintIfNotEmpty(inner_listener.str(), listener->stream()); + + return match; +} + +// An internal helper class for doing compile-time loop on a tuple's +// fields. +template <size_t N> +class TuplePrefix { + public: + // TuplePrefix<N>::Matches(matcher_tuple, value_tuple) returns true + // iff the first N fields of matcher_tuple matches the first N + // fields of value_tuple, respectively. + template <typename MatcherTuple, typename ValueTuple> + static bool Matches(const MatcherTuple& matcher_tuple, + const ValueTuple& value_tuple) { + return TuplePrefix<N - 1>::Matches(matcher_tuple, value_tuple) + && get<N - 1>(matcher_tuple).Matches(get<N - 1>(value_tuple)); + } + + // TuplePrefix<N>::ExplainMatchFailuresTo(matchers, values, os) + // describes failures in matching the first N fields of matchers + // against the first N fields of values. If there is no failure, + // nothing will be streamed to os. + template <typename MatcherTuple, typename ValueTuple> + static void ExplainMatchFailuresTo(const MatcherTuple& matchers, + const ValueTuple& values, + ::std::ostream* os) { + // First, describes failures in the first N - 1 fields. + TuplePrefix<N - 1>::ExplainMatchFailuresTo(matchers, values, os); + + // Then describes the failure (if any) in the (N - 1)-th (0-based) + // field. + typename tuple_element<N - 1, MatcherTuple>::type matcher = + get<N - 1>(matchers); + typedef typename tuple_element<N - 1, ValueTuple>::type Value; + Value value = get<N - 1>(values); + StringMatchResultListener listener; + if (!matcher.MatchAndExplain(value, &listener)) { + // TODO(wan): include in the message the name of the parameter + // as used in MOCK_METHOD*() when possible. + *os << " Expected arg #" << N - 1 << ": "; + get<N - 1>(matchers).DescribeTo(os); + *os << "\n Actual: "; + // We remove the reference in type Value to prevent the + // universal printer from printing the address of value, which + // isn't interesting to the user most of the time. The + // matcher's MatchAndExplain() method handles the case when + // the address is interesting. + internal::UniversalPrint(value, os); + PrintIfNotEmpty(listener.str(), os); + *os << "\n"; + } + } +}; + +// The base case. 
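+// TuplePrefix<0> matches trivially and explains nothing, terminating
+// the compile-time recursion described above.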
+template <> +class TuplePrefix<0> { + public: + template <typename MatcherTuple, typename ValueTuple> + static bool Matches(const MatcherTuple& /* matcher_tuple */, + const ValueTuple& /* value_tuple */) { + return true; + } + + template <typename MatcherTuple, typename ValueTuple> + static void ExplainMatchFailuresTo(const MatcherTuple& /* matchers */, + const ValueTuple& /* values */, + ::std::ostream* /* os */) {} +}; + +// TupleMatches(matcher_tuple, value_tuple) returns true iff all +// matchers in matcher_tuple match the corresponding fields in +// value_tuple. It is a compiler error if matcher_tuple and +// value_tuple have different number of fields or incompatible field +// types. +template <typename MatcherTuple, typename ValueTuple> +bool TupleMatches(const MatcherTuple& matcher_tuple, + const ValueTuple& value_tuple) { + // Makes sure that matcher_tuple and value_tuple have the same + // number of fields. + GTEST_COMPILE_ASSERT_(tuple_size<MatcherTuple>::value == + tuple_size<ValueTuple>::value, + matcher_and_value_have_different_numbers_of_fields); + return TuplePrefix<tuple_size<ValueTuple>::value>:: + Matches(matcher_tuple, value_tuple); +} + +// Describes failures in matching matchers against values. If there +// is no failure, nothing will be streamed to os. +template <typename MatcherTuple, typename ValueTuple> +void ExplainMatchFailureTupleTo(const MatcherTuple& matchers, + const ValueTuple& values, + ::std::ostream* os) { + TuplePrefix<tuple_size<MatcherTuple>::value>::ExplainMatchFailuresTo( + matchers, values, os); +} + +// TransformTupleValues and its helper. +// +// TransformTupleValuesHelper hides the internal machinery that +// TransformTupleValues uses to implement a tuple traversal. +template <typename Tuple, typename Func, typename OutIter> +class TransformTupleValuesHelper { + private: + typedef ::testing::tuple_size<Tuple> TupleSize; + + public: + // For each member of tuple 't', taken in order, evaluates '*out++ = f(t)'. + // Returns the final value of 'out' in case the caller needs it. + static OutIter Run(Func f, const Tuple& t, OutIter out) { + return IterateOverTuple<Tuple, TupleSize::value>()(f, t, out); + } + + private: + template <typename Tup, size_t kRemainingSize> + struct IterateOverTuple { + OutIter operator() (Func f, const Tup& t, OutIter out) const { + *out++ = f(::testing::get<TupleSize::value - kRemainingSize>(t)); + return IterateOverTuple<Tup, kRemainingSize - 1>()(f, t, out); + } + }; + template <typename Tup> + struct IterateOverTuple<Tup, 0> { + OutIter operator() (Func /* f */, const Tup& /* t */, OutIter out) const { + return out; + } + }; +}; + +// Successively invokes 'f(element)' on each element of the tuple 't', +// appending each result to the 'out' iterator. Returns the final value +// of 'out'. +template <typename Tuple, typename Func, typename OutIter> +OutIter TransformTupleValues(Func f, const Tuple& t, OutIter out) { + return TransformTupleValuesHelper<Tuple, Func, OutIter>::Run(f, t, out); +} + +// Implements A<T>(). +template <typename T> +class AnyMatcherImpl : public MatcherInterface<T> { + public: + virtual bool MatchAndExplain( + T /* x */, MatchResultListener* /* listener */) const { return true; } + virtual void DescribeTo(::std::ostream* os) const { *os << "is anything"; } + virtual void DescribeNegationTo(::std::ostream* os) const { + // This is mostly for completeness' safe, as it's not very useful + // to write Not(A<bool>()). 
However we cannot completely rule out + // such a possibility, and it doesn't hurt to be prepared. + *os << "never matches"; + } +}; + +// Implements _, a matcher that matches any value of any +// type. This is a polymorphic matcher, so we need a template type +// conversion operator to make it appearing as a Matcher<T> for any +// type T. +class AnythingMatcher { + public: + template <typename T> + operator Matcher<T>() const { return A<T>(); } +}; + +// Implements a matcher that compares a given value with a +// pre-supplied value using one of the ==, <=, <, etc, operators. The +// two values being compared don't have to have the same type. +// +// The matcher defined here is polymorphic (for example, Eq(5) can be +// used to match an int, a short, a double, etc). Therefore we use +// a template type conversion operator in the implementation. +// +// The following template definition assumes that the Rhs parameter is +// a "bare" type (i.e. neither 'const T' nor 'T&'). +template <typename D, typename Rhs, typename Op> +class ComparisonBase { + public: + explicit ComparisonBase(const Rhs& rhs) : rhs_(rhs) {} + template <typename Lhs> + operator Matcher<Lhs>() const { + return MakeMatcher(new Impl<Lhs>(rhs_)); + } + + private: + template <typename Lhs> + class Impl : public MatcherInterface<Lhs> { + public: + explicit Impl(const Rhs& rhs) : rhs_(rhs) {} + virtual bool MatchAndExplain( + Lhs lhs, MatchResultListener* /* listener */) const { + return Op()(lhs, rhs_); + } + virtual void DescribeTo(::std::ostream* os) const { + *os << D::Desc() << " "; + UniversalPrint(rhs_, os); + } + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << D::NegatedDesc() << " "; + UniversalPrint(rhs_, os); + } + private: + Rhs rhs_; + GTEST_DISALLOW_ASSIGN_(Impl); + }; + Rhs rhs_; + GTEST_DISALLOW_ASSIGN_(ComparisonBase); +}; + +template <typename Rhs> +class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> { + public: + explicit EqMatcher(const Rhs& rhs) + : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) { } + static const char* Desc() { return "is equal to"; } + static const char* NegatedDesc() { return "isn't equal to"; } +}; +template <typename Rhs> +class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> { + public: + explicit NeMatcher(const Rhs& rhs) + : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) { } + static const char* Desc() { return "isn't equal to"; } + static const char* NegatedDesc() { return "is equal to"; } +}; +template <typename Rhs> +class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> { + public: + explicit LtMatcher(const Rhs& rhs) + : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) { } + static const char* Desc() { return "is <"; } + static const char* NegatedDesc() { return "isn't <"; } +}; +template <typename Rhs> +class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> { + public: + explicit GtMatcher(const Rhs& rhs) + : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) { } + static const char* Desc() { return "is >"; } + static const char* NegatedDesc() { return "isn't >"; } +}; +template <typename Rhs> +class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> { + public: + explicit LeMatcher(const Rhs& rhs) + : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) { } + static const char* Desc() { return "is <="; } + static const char* NegatedDesc() { return "isn't <="; } +}; +template <typename Rhs> +class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> { + public: + explicit GeMatcher(const 
Rhs& rhs) + : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) { } + static const char* Desc() { return "is >="; } + static const char* NegatedDesc() { return "isn't >="; } +}; + +// Implements the polymorphic IsNull() matcher, which matches any raw or smart +// pointer that is NULL. +class IsNullMatcher { + public: + template <typename Pointer> + bool MatchAndExplain(const Pointer& p, + MatchResultListener* /* listener */) const { +#if GTEST_LANG_CXX11 + return p == nullptr; +#else // GTEST_LANG_CXX11 + return GetRawPointer(p) == NULL; +#endif // GTEST_LANG_CXX11 + } + + void DescribeTo(::std::ostream* os) const { *os << "is NULL"; } + void DescribeNegationTo(::std::ostream* os) const { + *os << "isn't NULL"; + } +}; + +// Implements the polymorphic NotNull() matcher, which matches any raw or smart +// pointer that is not NULL. +class NotNullMatcher { + public: + template <typename Pointer> + bool MatchAndExplain(const Pointer& p, + MatchResultListener* /* listener */) const { +#if GTEST_LANG_CXX11 + return p != nullptr; +#else // GTEST_LANG_CXX11 + return GetRawPointer(p) != NULL; +#endif // GTEST_LANG_CXX11 + } + + void DescribeTo(::std::ostream* os) const { *os << "isn't NULL"; } + void DescribeNegationTo(::std::ostream* os) const { + *os << "is NULL"; + } +}; + +// Ref(variable) matches any argument that is a reference to +// 'variable'. This matcher is polymorphic as it can match any +// super type of the type of 'variable'. +// +// The RefMatcher template class implements Ref(variable). It can +// only be instantiated with a reference type. This prevents a user +// from mistakenly using Ref(x) to match a non-reference function +// argument. For example, the following will righteously cause a +// compiler error: +// +// int n; +// Matcher<int> m1 = Ref(n); // This won't compile. +// Matcher<int&> m2 = Ref(n); // This will compile. +template <typename T> +class RefMatcher; + +template <typename T> +class RefMatcher<T&> { + // Google Mock is a generic framework and thus needs to support + // mocking any function types, including those that take non-const + // reference arguments. Therefore the template parameter T (and + // Super below) can be instantiated to either a const type or a + // non-const type. + public: + // RefMatcher() takes a T& instead of const T&, as we want the + // compiler to catch using Ref(const_value) as a matcher for a + // non-const reference. + explicit RefMatcher(T& x) : object_(x) {} // NOLINT + + template <typename Super> + operator Matcher<Super&>() const { + // By passing object_ (type T&) to Impl(), which expects a Super&, + // we make sure that Super is a super type of T. In particular, + // this catches using Ref(const_value) as a matcher for a + // non-const reference, as you cannot implicitly convert a const + // reference to a non-const reference. + return MakeMatcher(new Impl<Super>(object_)); + } + + private: + template <typename Super> + class Impl : public MatcherInterface<Super&> { + public: + explicit Impl(Super& x) : object_(x) {} // NOLINT + + // MatchAndExplain() takes a Super& (as opposed to const Super&) + // in order to match the interface MatcherInterface<Super&>. 
+ virtual bool MatchAndExplain( + Super& x, MatchResultListener* listener) const { + *listener << "which is located @" << static_cast<const void*>(&x); + return &x == &object_; + } + + virtual void DescribeTo(::std::ostream* os) const { + *os << "references the variable "; + UniversalPrinter<Super&>::Print(object_, os); + } + + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "does not reference the variable "; + UniversalPrinter<Super&>::Print(object_, os); + } + + private: + const Super& object_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + T& object_; + + GTEST_DISALLOW_ASSIGN_(RefMatcher); +}; + +// Polymorphic helper functions for narrow and wide string matchers. +inline bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) { + return String::CaseInsensitiveCStringEquals(lhs, rhs); +} + +inline bool CaseInsensitiveCStringEquals(const wchar_t* lhs, + const wchar_t* rhs) { + return String::CaseInsensitiveWideCStringEquals(lhs, rhs); +} + +// String comparison for narrow or wide strings that can have embedded NUL +// characters. +template <typename StringType> +bool CaseInsensitiveStringEquals(const StringType& s1, + const StringType& s2) { + // Are the heads equal? + if (!CaseInsensitiveCStringEquals(s1.c_str(), s2.c_str())) { + return false; + } + + // Skip the equal heads. + const typename StringType::value_type nul = 0; + const size_t i1 = s1.find(nul), i2 = s2.find(nul); + + // Are we at the end of either s1 or s2? + if (i1 == StringType::npos || i2 == StringType::npos) { + return i1 == i2; + } + + // Are the tails equal? + return CaseInsensitiveStringEquals(s1.substr(i1 + 1), s2.substr(i2 + 1)); +} + +// String matchers. + +// Implements equality-based string matchers like StrEq, StrCaseNe, and etc. +template <typename StringType> +class StrEqualityMatcher { + public: + StrEqualityMatcher(const StringType& str, bool expect_eq, + bool case_sensitive) + : string_(str), expect_eq_(expect_eq), case_sensitive_(case_sensitive) {} + + // Accepts pointer types, particularly: + // const char* + // char* + // const wchar_t* + // wchar_t* + template <typename CharType> + bool MatchAndExplain(CharType* s, MatchResultListener* listener) const { + if (s == NULL) { + return !expect_eq_; + } + return MatchAndExplain(StringType(s), listener); + } + + // Matches anything that can convert to StringType. + // + // This is a template, not just a plain function with const StringType&, + // because StringPiece has some interfering non-explicit constructors. + template <typename MatcheeStringType> + bool MatchAndExplain(const MatcheeStringType& s, + MatchResultListener* /* listener */) const { + const StringType& s2(s); + const bool eq = case_sensitive_ ? s2 == string_ : + CaseInsensitiveStringEquals(s2, string_); + return expect_eq_ == eq; + } + + void DescribeTo(::std::ostream* os) const { + DescribeToHelper(expect_eq_, os); + } + + void DescribeNegationTo(::std::ostream* os) const { + DescribeToHelper(!expect_eq_, os); + } + + private: + void DescribeToHelper(bool expect_eq, ::std::ostream* os) const { + *os << (expect_eq ? "is " : "isn't "); + *os << "equal to "; + if (!case_sensitive_) { + *os << "(ignoring case) "; + } + UniversalPrint(string_, os); + } + + const StringType string_; + const bool expect_eq_; + const bool case_sensitive_; + + GTEST_DISALLOW_ASSIGN_(StrEqualityMatcher); +}; + +// Implements the polymorphic HasSubstr(substring) matcher, which +// can be used as a Matcher<T> as long as T can be converted to a +// string. 
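+// For example (illustrative usage, not in the original source):
+//
+//   EXPECT_THAT(std::string("hello world"), HasSubstr("lo wo"));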
+template <typename StringType> +class HasSubstrMatcher { + public: + explicit HasSubstrMatcher(const StringType& substring) + : substring_(substring) {} + + // Accepts pointer types, particularly: + // const char* + // char* + // const wchar_t* + // wchar_t* + template <typename CharType> + bool MatchAndExplain(CharType* s, MatchResultListener* listener) const { + return s != NULL && MatchAndExplain(StringType(s), listener); + } + + // Matches anything that can convert to StringType. + // + // This is a template, not just a plain function with const StringType&, + // because StringPiece has some interfering non-explicit constructors. + template <typename MatcheeStringType> + bool MatchAndExplain(const MatcheeStringType& s, + MatchResultListener* /* listener */) const { + const StringType& s2(s); + return s2.find(substring_) != StringType::npos; + } + + // Describes what this matcher matches. + void DescribeTo(::std::ostream* os) const { + *os << "has substring "; + UniversalPrint(substring_, os); + } + + void DescribeNegationTo(::std::ostream* os) const { + *os << "has no substring "; + UniversalPrint(substring_, os); + } + + private: + const StringType substring_; + + GTEST_DISALLOW_ASSIGN_(HasSubstrMatcher); +}; + +// Implements the polymorphic StartsWith(substring) matcher, which +// can be used as a Matcher<T> as long as T can be converted to a +// string. +template <typename StringType> +class StartsWithMatcher { + public: + explicit StartsWithMatcher(const StringType& prefix) : prefix_(prefix) { + } + + // Accepts pointer types, particularly: + // const char* + // char* + // const wchar_t* + // wchar_t* + template <typename CharType> + bool MatchAndExplain(CharType* s, MatchResultListener* listener) const { + return s != NULL && MatchAndExplain(StringType(s), listener); + } + + // Matches anything that can convert to StringType. + // + // This is a template, not just a plain function with const StringType&, + // because StringPiece has some interfering non-explicit constructors. + template <typename MatcheeStringType> + bool MatchAndExplain(const MatcheeStringType& s, + MatchResultListener* /* listener */) const { + const StringType& s2(s); + return s2.length() >= prefix_.length() && + s2.substr(0, prefix_.length()) == prefix_; + } + + void DescribeTo(::std::ostream* os) const { + *os << "starts with "; + UniversalPrint(prefix_, os); + } + + void DescribeNegationTo(::std::ostream* os) const { + *os << "doesn't start with "; + UniversalPrint(prefix_, os); + } + + private: + const StringType prefix_; + + GTEST_DISALLOW_ASSIGN_(StartsWithMatcher); +}; + +// Implements the polymorphic EndsWith(substring) matcher, which +// can be used as a Matcher<T> as long as T can be converted to a +// string. +template <typename StringType> +class EndsWithMatcher { + public: + explicit EndsWithMatcher(const StringType& suffix) : suffix_(suffix) {} + + // Accepts pointer types, particularly: + // const char* + // char* + // const wchar_t* + // wchar_t* + template <typename CharType> + bool MatchAndExplain(CharType* s, MatchResultListener* listener) const { + return s != NULL && MatchAndExplain(StringType(s), listener); + } + + // Matches anything that can convert to StringType. + // + // This is a template, not just a plain function with const StringType&, + // because StringPiece has some interfering non-explicit constructors. 
+ template <typename MatcheeStringType> + bool MatchAndExplain(const MatcheeStringType& s, + MatchResultListener* /* listener */) const { + const StringType& s2(s); + return s2.length() >= suffix_.length() && + s2.substr(s2.length() - suffix_.length()) == suffix_; + } + + void DescribeTo(::std::ostream* os) const { + *os << "ends with "; + UniversalPrint(suffix_, os); + } + + void DescribeNegationTo(::std::ostream* os) const { + *os << "doesn't end with "; + UniversalPrint(suffix_, os); + } + + private: + const StringType suffix_; + + GTEST_DISALLOW_ASSIGN_(EndsWithMatcher); +}; + +// Implements polymorphic matchers MatchesRegex(regex) and +// ContainsRegex(regex), which can be used as a Matcher<T> as long as +// T can be converted to a string. +class MatchesRegexMatcher { + public: + MatchesRegexMatcher(const RE* regex, bool full_match) + : regex_(regex), full_match_(full_match) {} + + // Accepts pointer types, particularly: + // const char* + // char* + // const wchar_t* + // wchar_t* + template <typename CharType> + bool MatchAndExplain(CharType* s, MatchResultListener* listener) const { + return s != NULL && MatchAndExplain(internal::string(s), listener); + } + + // Matches anything that can convert to internal::string. + // + // This is a template, not just a plain function with const internal::string&, + // because StringPiece has some interfering non-explicit constructors. + template <class MatcheeStringType> + bool MatchAndExplain(const MatcheeStringType& s, + MatchResultListener* /* listener */) const { + const internal::string& s2(s); + return full_match_ ? RE::FullMatch(s2, *regex_) : + RE::PartialMatch(s2, *regex_); + } + + void DescribeTo(::std::ostream* os) const { + *os << (full_match_ ? "matches" : "contains") + << " regular expression "; + UniversalPrinter<internal::string>::Print(regex_->pattern(), os); + } + + void DescribeNegationTo(::std::ostream* os) const { + *os << "doesn't " << (full_match_ ? "match" : "contain") + << " regular expression "; + UniversalPrinter<internal::string>::Print(regex_->pattern(), os); + } + + private: + const internal::linked_ptr<const RE> regex_; + const bool full_match_; + + GTEST_DISALLOW_ASSIGN_(MatchesRegexMatcher); +}; + +// Implements a matcher that compares the two fields of a 2-tuple +// using one of the ==, <=, <, etc, operators. The two fields being +// compared don't have to have the same type. +// +// The matcher defined here is polymorphic (for example, Eq() can be +// used to match a tuple<int, short>, a tuple<const long&, double>, +// etc). Therefore we use a template type conversion operator in the +// implementation. 
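+// For example (illustrative usage, not in the original source), these
+// 2-tuple matchers are what Pointwise() applies elementwise when
+// comparing two containers:
+//
+//   EXPECT_THAT(actual_values, Pointwise(Eq(), expected_values));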
+template <typename D, typename Op> +class PairMatchBase { + public: + template <typename T1, typename T2> + operator Matcher< ::testing::tuple<T1, T2> >() const { + return MakeMatcher(new Impl< ::testing::tuple<T1, T2> >); + } + template <typename T1, typename T2> + operator Matcher<const ::testing::tuple<T1, T2>&>() const { + return MakeMatcher(new Impl<const ::testing::tuple<T1, T2>&>); + } + + private: + static ::std::ostream& GetDesc(::std::ostream& os) { // NOLINT + return os << D::Desc(); + } + + template <typename Tuple> + class Impl : public MatcherInterface<Tuple> { + public: + virtual bool MatchAndExplain( + Tuple args, + MatchResultListener* /* listener */) const { + return Op()(::testing::get<0>(args), ::testing::get<1>(args)); + } + virtual void DescribeTo(::std::ostream* os) const { + *os << "are " << GetDesc; + } + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "aren't " << GetDesc; + } + }; +}; + +class Eq2Matcher : public PairMatchBase<Eq2Matcher, AnyEq> { + public: + static const char* Desc() { return "an equal pair"; } +}; +class Ne2Matcher : public PairMatchBase<Ne2Matcher, AnyNe> { + public: + static const char* Desc() { return "an unequal pair"; } +}; +class Lt2Matcher : public PairMatchBase<Lt2Matcher, AnyLt> { + public: + static const char* Desc() { return "a pair where the first < the second"; } +}; +class Gt2Matcher : public PairMatchBase<Gt2Matcher, AnyGt> { + public: + static const char* Desc() { return "a pair where the first > the second"; } +}; +class Le2Matcher : public PairMatchBase<Le2Matcher, AnyLe> { + public: + static const char* Desc() { return "a pair where the first <= the second"; } +}; +class Ge2Matcher : public PairMatchBase<Ge2Matcher, AnyGe> { + public: + static const char* Desc() { return "a pair where the first >= the second"; } +}; + +// Implements the Not(...) matcher for a particular argument type T. +// We do not nest it inside the NotMatcher class template, as that +// will prevent different instantiations of NotMatcher from sharing +// the same NotMatcherImpl<T> class. +template <typename T> +class NotMatcherImpl : public MatcherInterface<T> { + public: + explicit NotMatcherImpl(const Matcher<T>& matcher) + : matcher_(matcher) {} + + virtual bool MatchAndExplain(T x, MatchResultListener* listener) const { + return !matcher_.MatchAndExplain(x, listener); + } + + virtual void DescribeTo(::std::ostream* os) const { + matcher_.DescribeNegationTo(os); + } + + virtual void DescribeNegationTo(::std::ostream* os) const { + matcher_.DescribeTo(os); + } + + private: + const Matcher<T> matcher_; + + GTEST_DISALLOW_ASSIGN_(NotMatcherImpl); +}; + +// Implements the Not(m) matcher, which matches a value that doesn't +// match matcher m. +template <typename InnerMatcher> +class NotMatcher { + public: + explicit NotMatcher(InnerMatcher matcher) : matcher_(matcher) {} + + // This template type conversion operator allows Not(m) to be used + // to match any type m can match. + template <typename T> + operator Matcher<T>() const { + return Matcher<T>(new NotMatcherImpl<T>(SafeMatcherCast<T>(matcher_))); + } + + private: + InnerMatcher matcher_; + + GTEST_DISALLOW_ASSIGN_(NotMatcher); +}; + +// Implements the AllOf(m1, m2) matcher for a particular argument type +// T. We do not nest it inside the BothOfMatcher class template, as +// that will prevent different instantiations of BothOfMatcher from +// sharing the same BothOfMatcherImpl<T> class. 
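+// For example (illustrative usage, not in the original source):
+//
+//   Matcher<int> in_range = AllOf(Gt(0), Lt(10));  // matches 1, ..., 9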
+template <typename T> +class BothOfMatcherImpl : public MatcherInterface<T> { + public: + BothOfMatcherImpl(const Matcher<T>& matcher1, const Matcher<T>& matcher2) + : matcher1_(matcher1), matcher2_(matcher2) {} + + virtual void DescribeTo(::std::ostream* os) const { + *os << "("; + matcher1_.DescribeTo(os); + *os << ") and ("; + matcher2_.DescribeTo(os); + *os << ")"; + } + + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "("; + matcher1_.DescribeNegationTo(os); + *os << ") or ("; + matcher2_.DescribeNegationTo(os); + *os << ")"; + } + + virtual bool MatchAndExplain(T x, MatchResultListener* listener) const { + // If either matcher1_ or matcher2_ doesn't match x, we only need + // to explain why one of them fails. + StringMatchResultListener listener1; + if (!matcher1_.MatchAndExplain(x, &listener1)) { + *listener << listener1.str(); + return false; + } + + StringMatchResultListener listener2; + if (!matcher2_.MatchAndExplain(x, &listener2)) { + *listener << listener2.str(); + return false; + } + + // Otherwise we need to explain why *both* of them match. + const internal::string s1 = listener1.str(); + const internal::string s2 = listener2.str(); + + if (s1 == "") { + *listener << s2; + } else { + *listener << s1; + if (s2 != "") { + *listener << ", and " << s2; + } + } + return true; + } + + private: + const Matcher<T> matcher1_; + const Matcher<T> matcher2_; + + GTEST_DISALLOW_ASSIGN_(BothOfMatcherImpl); +}; + +#if GTEST_LANG_CXX11 +// MatcherList provides mechanisms for storing a variable number of matchers in +// a list structure (ListType) and creating a combining matcher from such a +// list. +// The template is defined recursively using the following template paramters: +// * kSize is the length of the MatcherList. +// * Head is the type of the first matcher of the list. +// * Tail denotes the types of the remaining matchers of the list. +template <int kSize, typename Head, typename... Tail> +struct MatcherList { + typedef MatcherList<kSize - 1, Tail...> MatcherListTail; + typedef ::std::pair<Head, typename MatcherListTail::ListType> ListType; + + // BuildList stores variadic type values in a nested pair structure. + // Example: + // MatcherList<3, int, string, float>::BuildList(5, "foo", 2.0) will return + // the corresponding result of type pair<int, pair<string, float>>. + static ListType BuildList(const Head& matcher, const Tail&... tail) { + return ListType(matcher, MatcherListTail::BuildList(tail...)); + } + + // CreateMatcher<T> creates a Matcher<T> from a given list of matchers (built + // by BuildList()). CombiningMatcher<T> is used to combine the matchers of the + // list. CombiningMatcher<T> must implement MatcherInterface<T> and have a + // constructor taking two Matcher<T>s as input. + template <typename T, template <typename /* T */> class CombiningMatcher> + static Matcher<T> CreateMatcher(const ListType& matchers) { + return Matcher<T>(new CombiningMatcher<T>( + SafeMatcherCast<T>(matchers.first), + MatcherListTail::template CreateMatcher<T, CombiningMatcher>( + matchers.second))); + } +}; + +// The following defines the base case for the recursive definition of +// MatcherList. 
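+// The recursion stops at a list of two matchers rather than zero or
+// one, as the variadic AllOf()/AnyOf() take at least two arguments.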
+template <typename Matcher1, typename Matcher2> +struct MatcherList<2, Matcher1, Matcher2> { + typedef ::std::pair<Matcher1, Matcher2> ListType; + + static ListType BuildList(const Matcher1& matcher1, + const Matcher2& matcher2) { + return ::std::pair<Matcher1, Matcher2>(matcher1, matcher2); + } + + template <typename T, template <typename /* T */> class CombiningMatcher> + static Matcher<T> CreateMatcher(const ListType& matchers) { + return Matcher<T>(new CombiningMatcher<T>( + SafeMatcherCast<T>(matchers.first), + SafeMatcherCast<T>(matchers.second))); + } +}; + +// VariadicMatcher is used for the variadic implementation of +// AllOf(m_1, m_2, ...) and AnyOf(m_1, m_2, ...). +// CombiningMatcher<T> is used to recursively combine the provided matchers +// (of type Args...). +template <template <typename T> class CombiningMatcher, typename... Args> +class VariadicMatcher { + public: + VariadicMatcher(const Args&... matchers) // NOLINT + : matchers_(MatcherListType::BuildList(matchers...)) {} + + // This template type conversion operator allows an + // VariadicMatcher<Matcher1, Matcher2...> object to match any type that + // all of the provided matchers (Matcher1, Matcher2, ...) can match. + template <typename T> + operator Matcher<T>() const { + return MatcherListType::template CreateMatcher<T, CombiningMatcher>( + matchers_); + } + + private: + typedef MatcherList<sizeof...(Args), Args...> MatcherListType; + + const typename MatcherListType::ListType matchers_; + + GTEST_DISALLOW_ASSIGN_(VariadicMatcher); +}; + +template <typename... Args> +using AllOfMatcher = VariadicMatcher<BothOfMatcherImpl, Args...>; + +#endif // GTEST_LANG_CXX11 + +// Used for implementing the AllOf(m_1, ..., m_n) matcher, which +// matches a value that matches all of the matchers m_1, ..., and m_n. +template <typename Matcher1, typename Matcher2> +class BothOfMatcher { + public: + BothOfMatcher(Matcher1 matcher1, Matcher2 matcher2) + : matcher1_(matcher1), matcher2_(matcher2) {} + + // This template type conversion operator allows a + // BothOfMatcher<Matcher1, Matcher2> object to match any type that + // both Matcher1 and Matcher2 can match. + template <typename T> + operator Matcher<T>() const { + return Matcher<T>(new BothOfMatcherImpl<T>(SafeMatcherCast<T>(matcher1_), + SafeMatcherCast<T>(matcher2_))); + } + + private: + Matcher1 matcher1_; + Matcher2 matcher2_; + + GTEST_DISALLOW_ASSIGN_(BothOfMatcher); +}; + +// Implements the AnyOf(m1, m2) matcher for a particular argument type +// T. We do not nest it inside the AnyOfMatcher class template, as +// that will prevent different instantiations of AnyOfMatcher from +// sharing the same EitherOfMatcherImpl<T> class. +template <typename T> +class EitherOfMatcherImpl : public MatcherInterface<T> { + public: + EitherOfMatcherImpl(const Matcher<T>& matcher1, const Matcher<T>& matcher2) + : matcher1_(matcher1), matcher2_(matcher2) {} + + virtual void DescribeTo(::std::ostream* os) const { + *os << "("; + matcher1_.DescribeTo(os); + *os << ") or ("; + matcher2_.DescribeTo(os); + *os << ")"; + } + + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "("; + matcher1_.DescribeNegationTo(os); + *os << ") and ("; + matcher2_.DescribeNegationTo(os); + *os << ")"; + } + + virtual bool MatchAndExplain(T x, MatchResultListener* listener) const { + // If either matcher1_ or matcher2_ matches x, we just need to + // explain why *one* of them matches. 
+ StringMatchResultListener listener1; + if (matcher1_.MatchAndExplain(x, &listener1)) { + *listener << listener1.str(); + return true; + } + + StringMatchResultListener listener2; + if (matcher2_.MatchAndExplain(x, &listener2)) { + *listener << listener2.str(); + return true; + } + + // Otherwise we need to explain why *both* of them fail. + const internal::string s1 = listener1.str(); + const internal::string s2 = listener2.str(); + + if (s1 == "") { + *listener << s2; + } else { + *listener << s1; + if (s2 != "") { + *listener << ", and " << s2; + } + } + return false; + } + + private: + const Matcher<T> matcher1_; + const Matcher<T> matcher2_; + + GTEST_DISALLOW_ASSIGN_(EitherOfMatcherImpl); +}; + +#if GTEST_LANG_CXX11 +// AnyOfMatcher is used for the variadic implementation of AnyOf(m_1, m_2, ...). +template <typename... Args> +using AnyOfMatcher = VariadicMatcher<EitherOfMatcherImpl, Args...>; + +#endif // GTEST_LANG_CXX11 + +// Used for implementing the AnyOf(m_1, ..., m_n) matcher, which +// matches a value that matches at least one of the matchers m_1, ..., +// and m_n. +template <typename Matcher1, typename Matcher2> +class EitherOfMatcher { + public: + EitherOfMatcher(Matcher1 matcher1, Matcher2 matcher2) + : matcher1_(matcher1), matcher2_(matcher2) {} + + // This template type conversion operator allows a + // EitherOfMatcher<Matcher1, Matcher2> object to match any type that + // both Matcher1 and Matcher2 can match. + template <typename T> + operator Matcher<T>() const { + return Matcher<T>(new EitherOfMatcherImpl<T>( + SafeMatcherCast<T>(matcher1_), SafeMatcherCast<T>(matcher2_))); + } + + private: + Matcher1 matcher1_; + Matcher2 matcher2_; + + GTEST_DISALLOW_ASSIGN_(EitherOfMatcher); +}; + +// Used for implementing Truly(pred), which turns a predicate into a +// matcher. +template <typename Predicate> +class TrulyMatcher { + public: + explicit TrulyMatcher(Predicate pred) : predicate_(pred) {} + + // This method template allows Truly(pred) to be used as a matcher + // for type T where T is the argument type of predicate 'pred'. The + // argument is passed by reference as the predicate may be + // interested in the address of the argument. + template <typename T> + bool MatchAndExplain(T& x, // NOLINT + MatchResultListener* /* listener */) const { + // Without the if-statement, MSVC sometimes warns about converting + // a value to bool (warning 4800). + // + // We cannot write 'return !!predicate_(x);' as that doesn't work + // when predicate_(x) returns a class convertible to bool but + // having no operator!(). + if (predicate_(x)) + return true; + return false; + } + + void DescribeTo(::std::ostream* os) const { + *os << "satisfies the given predicate"; + } + + void DescribeNegationTo(::std::ostream* os) const { + *os << "doesn't satisfy the given predicate"; + } + + private: + Predicate predicate_; + + GTEST_DISALLOW_ASSIGN_(TrulyMatcher); +}; + +// Used for implementing Matches(matcher), which turns a matcher into +// a predicate. +template <typename M> +class MatcherAsPredicate { + public: + explicit MatcherAsPredicate(M matcher) : matcher_(matcher) {} + + // This template operator() allows Matches(m) to be used as a + // predicate on type T where m is a matcher on type T. + // + // The argument x is passed by reference instead of by value, as + // some matcher may be interested in its address (e.g. as in + // Matches(Ref(n))(x)). 
+ template <typename T> + bool operator()(const T& x) const { + // We let matcher_ commit to a particular type here instead of + // when the MatcherAsPredicate object was constructed. This + // allows us to write Matches(m) where m is a polymorphic matcher + // (e.g. Eq(5)). + // + // If we write Matcher<T>(matcher_).Matches(x) here, it won't + // compile when matcher_ has type Matcher<const T&>; if we write + // Matcher<const T&>(matcher_).Matches(x) here, it won't compile + // when matcher_ has type Matcher<T>; if we just write + // matcher_.Matches(x), it won't compile when matcher_ is + // polymorphic, e.g. Eq(5). + // + // MatcherCast<const T&>() is necessary for making the code work + // in all of the above situations. + return MatcherCast<const T&>(matcher_).Matches(x); + } + + private: + M matcher_; + + GTEST_DISALLOW_ASSIGN_(MatcherAsPredicate); +}; + +// For implementing ASSERT_THAT() and EXPECT_THAT(). The template +// argument M must be a type that can be converted to a matcher. +template <typename M> +class PredicateFormatterFromMatcher { + public: + explicit PredicateFormatterFromMatcher(M m) : matcher_(internal::move(m)) {} + + // This template () operator allows a PredicateFormatterFromMatcher + // object to act as a predicate-formatter suitable for using with + // Google Test's EXPECT_PRED_FORMAT1() macro. + template <typename T> + AssertionResult operator()(const char* value_text, const T& x) const { + // We convert matcher_ to a Matcher<const T&> *now* instead of + // when the PredicateFormatterFromMatcher object was constructed, + // as matcher_ may be polymorphic (e.g. NotNull()) and we won't + // know which type to instantiate it to until we actually see the + // type of x here. + // + // We write SafeMatcherCast<const T&>(matcher_) instead of + // Matcher<const T&>(matcher_), as the latter won't compile when + // matcher_ has type Matcher<T> (e.g. An<int>()). + // We don't write MatcherCast<const T&> either, as that allows + // potentially unsafe downcasting of the matcher argument. + const Matcher<const T&> matcher = SafeMatcherCast<const T&>(matcher_); + StringMatchResultListener listener; + if (MatchPrintAndExplain(x, matcher, &listener)) + return AssertionSuccess(); + + ::std::stringstream ss; + ss << "Value of: " << value_text << "\n" + << "Expected: "; + matcher.DescribeTo(&ss); + ss << "\n Actual: " << listener.str(); + return AssertionFailure() << ss.str(); + } + + private: + const M matcher_; + + GTEST_DISALLOW_ASSIGN_(PredicateFormatterFromMatcher); +}; + +// A helper function for converting a matcher to a predicate-formatter +// without the user needing to explicitly write the type. This is +// used for implementing ASSERT_THAT() and EXPECT_THAT(). +// Implementation detail: 'matcher' is received by-value to force decaying. +template <typename M> +inline PredicateFormatterFromMatcher<M> +MakePredicateFormatterFromMatcher(M matcher) { + return PredicateFormatterFromMatcher<M>(internal::move(matcher)); +} + +// Implements the polymorphic floating point equality matcher, which matches +// two float values using ULP-based approximation or, optionally, a +// user-specified epsilon. The template is meant to be instantiated with +// FloatType being either float or double. +template <typename FloatType> +class FloatingEqMatcher { + public: + // Constructor for FloatingEqMatcher. + // The matcher's input will be compared with expected. The matcher treats two + // NANs as equal if nan_eq_nan is true. 
Otherwise, under IEEE standards,
+  // equality comparisons between NANs will always return false. We specify a
+  // negative max_abs_error_ term to indicate that ULP-based approximation will
+  // be used for comparison.
+  FloatingEqMatcher(FloatType expected, bool nan_eq_nan) :
+    expected_(expected), nan_eq_nan_(nan_eq_nan), max_abs_error_(-1) {
+  }
+
+  // Constructor that supports a user-specified max_abs_error that will be used
+  // for comparison instead of ULP-based approximation. The max absolute
+  // error should be non-negative.
+  FloatingEqMatcher(FloatType expected, bool nan_eq_nan,
+                    FloatType max_abs_error)
+      : expected_(expected),
+        nan_eq_nan_(nan_eq_nan),
+        max_abs_error_(max_abs_error) {
+    GTEST_CHECK_(max_abs_error >= 0)
+        << ", where max_abs_error is " << max_abs_error;
+  }
+
+  // Implements floating point equality matcher as a Matcher<T>.
+  template <typename T>
+  class Impl : public MatcherInterface<T> {
+   public:
+    Impl(FloatType expected, bool nan_eq_nan, FloatType max_abs_error)
+        : expected_(expected),
+          nan_eq_nan_(nan_eq_nan),
+          max_abs_error_(max_abs_error) {}
+
+    virtual bool MatchAndExplain(T value,
+                                 MatchResultListener* listener) const {
+      const FloatingPoint<FloatType> actual(value), expected(expected_);
+
+      // Compares NaNs first. Two NaNs are considered equal iff
+      // nan_eq_nan_ is true.
+      if (actual.is_nan() || expected.is_nan()) {
+        if (actual.is_nan() && expected.is_nan()) {
+          return nan_eq_nan_;
+        }
+        // One is nan; the other is not nan.
+        return false;
+      }
+      if (HasMaxAbsError()) {
+        // We perform an equality check so that inf will match inf, regardless
+        // of error bounds. If value - expected_ would overflow, or if either
+        // value is inf, the difference is infinity, which should only match
+        // if max_abs_error_ is also infinity.
+        if (value == expected_) {
+          return true;
+        }
+
+        const FloatType diff = value - expected_;
+        if (fabs(diff) <= max_abs_error_) {
+          return true;
+        }
+
+        if (listener->IsInterested()) {
+          *listener << "which is " << diff << " from " << expected_;
+        }
+        return false;
+      } else {
+        return actual.AlmostEquals(expected);
+      }
+    }
+
+    virtual void DescribeTo(::std::ostream* os) const {
+      // os->precision() returns the previously set precision, which we
+      // store to restore the ostream to its original configuration
+      // after outputting.
+      const ::std::streamsize old_precision = os->precision(
+          ::std::numeric_limits<FloatType>::digits10 + 2);
+      if (FloatingPoint<FloatType>(expected_).is_nan()) {
+        if (nan_eq_nan_) {
+          *os << "is NaN";
+        } else {
+          *os << "never matches";
+        }
+      } else {
+        *os << "is approximately " << expected_;
+        if (HasMaxAbsError()) {
+          *os << " (absolute error <= " << max_abs_error_ << ")";
+        }
+      }
+      os->precision(old_precision);
+    }
+
+    virtual void DescribeNegationTo(::std::ostream* os) const {
+      // As before, get original precision.
+      const ::std::streamsize old_precision = os->precision(
+          ::std::numeric_limits<FloatType>::digits10 + 2);
+      if (FloatingPoint<FloatType>(expected_).is_nan()) {
+        if (nan_eq_nan_) {
+          *os << "isn't NaN";
+        } else {
+          *os << "is anything";
+        }
+      } else {
+        *os << "isn't approximately " << expected_;
+        if (HasMaxAbsError()) {
+          *os << " (absolute error > " << max_abs_error_ << ")";
+        }
+      }
+      // Restore original precision.
+      os->precision(old_precision);
+    }
+
+   private:
+    bool HasMaxAbsError() const {
+      return max_abs_error_ >= 0;
+    }
+
+    const FloatType expected_;
+    const bool nan_eq_nan_;
+    // max_abs_error will be used for value comparison when >= 0.
+ const FloatType max_abs_error_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + // The following 3 type conversion operators allow FloatEq(expected) and + // NanSensitiveFloatEq(expected) to be used as a Matcher<float>, a + // Matcher<const float&>, or a Matcher<float&>, but nothing else. + // (While Google's C++ coding style doesn't allow arguments passed + // by non-const reference, we may see them in code not conforming to + // the style. Therefore Google Mock needs to support them.) + operator Matcher<FloatType>() const { + return MakeMatcher( + new Impl<FloatType>(expected_, nan_eq_nan_, max_abs_error_)); + } + + operator Matcher<const FloatType&>() const { + return MakeMatcher( + new Impl<const FloatType&>(expected_, nan_eq_nan_, max_abs_error_)); + } + + operator Matcher<FloatType&>() const { + return MakeMatcher( + new Impl<FloatType&>(expected_, nan_eq_nan_, max_abs_error_)); + } + + private: + const FloatType expected_; + const bool nan_eq_nan_; + // max_abs_error will be used for value comparison when >= 0. + const FloatType max_abs_error_; + + GTEST_DISALLOW_ASSIGN_(FloatingEqMatcher); +}; + +// Implements the Pointee(m) matcher for matching a pointer whose +// pointee matches matcher m. The pointer can be either raw or smart. +template <typename InnerMatcher> +class PointeeMatcher { + public: + explicit PointeeMatcher(const InnerMatcher& matcher) : matcher_(matcher) {} + + // This type conversion operator template allows Pointee(m) to be + // used as a matcher for any pointer type whose pointee type is + // compatible with the inner matcher, where type Pointer can be + // either a raw pointer or a smart pointer. + // + // The reason we do this instead of relying on + // MakePolymorphicMatcher() is that the latter is not flexible + // enough for implementing the DescribeTo() method of Pointee(). + template <typename Pointer> + operator Matcher<Pointer>() const { + return MakeMatcher(new Impl<Pointer>(matcher_)); + } + + private: + // The monomorphic implementation that works for a particular pointer type. + template <typename Pointer> + class Impl : public MatcherInterface<Pointer> { + public: + typedef typename PointeeOf<GTEST_REMOVE_CONST_( // NOLINT + GTEST_REMOVE_REFERENCE_(Pointer))>::type Pointee; + + explicit Impl(const InnerMatcher& matcher) + : matcher_(MatcherCast<const Pointee&>(matcher)) {} + + virtual void DescribeTo(::std::ostream* os) const { + *os << "points to a value that "; + matcher_.DescribeTo(os); + } + + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "does not point to a value that "; + matcher_.DescribeTo(os); + } + + virtual bool MatchAndExplain(Pointer pointer, + MatchResultListener* listener) const { + if (GetRawPointer(pointer) == NULL) + return false; + + *listener << "which points to "; + return MatchPrintAndExplain(*pointer, matcher_, listener); + } + + private: + const Matcher<const Pointee&> matcher_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + const InnerMatcher matcher_; + + GTEST_DISALLOW_ASSIGN_(PointeeMatcher); +}; + +// Implements the WhenDynamicCastTo<T>(m) matcher that matches a pointer or +// reference that matches inner_matcher when dynamic_cast<T> is applied. +// The result of dynamic_cast<To> is forwarded to the inner matcher. +// If To is a pointer and the cast fails, the inner matcher will receive NULL. +// If To is a reference and the cast fails, this matcher returns false +// immediately. 
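+//
+// Illustrative usage sketch (not part of the upstream header), assuming a
+// hypothetical polymorphic hierarchy Base/Derived and a hypothetical
+// factory MakeDerived():
+//
+//   Base* base = MakeDerived();
+//   EXPECT_THAT(base, WhenDynamicCastTo<Derived*>(NotNull()));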
+template <typename To> +class WhenDynamicCastToMatcherBase { + public: + explicit WhenDynamicCastToMatcherBase(const Matcher<To>& matcher) + : matcher_(matcher) {} + + void DescribeTo(::std::ostream* os) const { + GetCastTypeDescription(os); + matcher_.DescribeTo(os); + } + + void DescribeNegationTo(::std::ostream* os) const { + GetCastTypeDescription(os); + matcher_.DescribeNegationTo(os); + } + + protected: + const Matcher<To> matcher_; + + static string GetToName() { +#if GTEST_HAS_RTTI + return GetTypeName<To>(); +#else // GTEST_HAS_RTTI + return "the target type"; +#endif // GTEST_HAS_RTTI + } + + private: + static void GetCastTypeDescription(::std::ostream* os) { + *os << "when dynamic_cast to " << GetToName() << ", "; + } + + GTEST_DISALLOW_ASSIGN_(WhenDynamicCastToMatcherBase); +}; + +// Primary template. +// To is a pointer. Cast and forward the result. +template <typename To> +class WhenDynamicCastToMatcher : public WhenDynamicCastToMatcherBase<To> { + public: + explicit WhenDynamicCastToMatcher(const Matcher<To>& matcher) + : WhenDynamicCastToMatcherBase<To>(matcher) {} + + template <typename From> + bool MatchAndExplain(From from, MatchResultListener* listener) const { + // TODO(sbenza): Add more detail on failures. ie did the dyn_cast fail? + To to = dynamic_cast<To>(from); + return MatchPrintAndExplain(to, this->matcher_, listener); + } +}; + +// Specialize for references. +// In this case we return false if the dynamic_cast fails. +template <typename To> +class WhenDynamicCastToMatcher<To&> : public WhenDynamicCastToMatcherBase<To&> { + public: + explicit WhenDynamicCastToMatcher(const Matcher<To&>& matcher) + : WhenDynamicCastToMatcherBase<To&>(matcher) {} + + template <typename From> + bool MatchAndExplain(From& from, MatchResultListener* listener) const { + // We don't want an std::bad_cast here, so do the cast with pointers. + To* to = dynamic_cast<To*>(&from); + if (to == NULL) { + *listener << "which cannot be dynamic_cast to " << this->GetToName(); + return false; + } + return MatchPrintAndExplain(*to, this->matcher_, listener); + } +}; + +// Implements the Field() matcher for matching a field (i.e. member +// variable) of an object. +template <typename Class, typename FieldType> +class FieldMatcher { + public: + FieldMatcher(FieldType Class::*field, + const Matcher<const FieldType&>& matcher) + : field_(field), matcher_(matcher) {} + + void DescribeTo(::std::ostream* os) const { + *os << "is an object whose given field "; + matcher_.DescribeTo(os); + } + + void DescribeNegationTo(::std::ostream* os) const { + *os << "is an object whose given field "; + matcher_.DescribeNegationTo(os); + } + + template <typename T> + bool MatchAndExplain(const T& value, MatchResultListener* listener) const { + return MatchAndExplainImpl( + typename ::testing::internal:: + is_pointer<GTEST_REMOVE_CONST_(T)>::type(), + value, listener); + } + + private: + // The first argument of MatchAndExplainImpl() is needed to help + // Symbian's C++ compiler choose which overload to use. Its type is + // true_type iff the Field() matcher is used to match a pointer. 
+  bool MatchAndExplainImpl(false_type /* is_not_pointer */, const Class& obj,
+                           MatchResultListener* listener) const {
+    *listener << "whose given field is ";
+    return MatchPrintAndExplain(obj.*field_, matcher_, listener);
+  }
+
+  bool MatchAndExplainImpl(true_type /* is_pointer */, const Class* p,
+                           MatchResultListener* listener) const {
+    if (p == NULL)
+      return false;
+
+    *listener << "which points to an object ";
+    // Since *p has a field, it must be a class/struct/union type and
+    // thus cannot be a pointer. Therefore we pass false_type() as
+    // the first argument.
+    return MatchAndExplainImpl(false_type(), *p, listener);
+  }
+
+  const FieldType Class::*field_;
+  const Matcher<const FieldType&> matcher_;
+
+  GTEST_DISALLOW_ASSIGN_(FieldMatcher);
+};
+
+// Implements the Property() matcher for matching a property
+// (i.e. return value of a getter method) of an object.
+template <typename Class, typename PropertyType>
+class PropertyMatcher {
+ public:
+  // The property may have a reference type, so 'const PropertyType&'
+  // may cause double references and fail to compile. That's why we
+  // need GTEST_REFERENCE_TO_CONST, which works regardless of
+  // PropertyType being a reference or not.
+  typedef GTEST_REFERENCE_TO_CONST_(PropertyType) RefToConstProperty;
+
+  PropertyMatcher(PropertyType (Class::*property)() const,
+                  const Matcher<RefToConstProperty>& matcher)
+      : property_(property), matcher_(matcher) {}
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "is an object whose given property ";
+    matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "is an object whose given property ";
+    matcher_.DescribeNegationTo(os);
+  }
+
+  template <typename T>
+  bool MatchAndExplain(const T& value, MatchResultListener* listener) const {
+    return MatchAndExplainImpl(
+        typename ::testing::internal::
+            is_pointer<GTEST_REMOVE_CONST_(T)>::type(),
+        value, listener);
+  }
+
+ private:
+  // The first argument of MatchAndExplainImpl() is needed to help
+  // Symbian's C++ compiler choose which overload to use. Its type is
+  // true_type iff the Property() matcher is used to match a pointer.
+  bool MatchAndExplainImpl(false_type /* is_not_pointer */, const Class& obj,
+                           MatchResultListener* listener) const {
+    *listener << "whose given property is ";
+    // Cannot pass the return value (for example, int) to MatchPrintAndExplain,
+    // which takes a non-const reference as argument.
+#if defined(_PREFAST_) && _MSC_VER == 1800
+    // Workaround bug in VC++ 2013's /analyze parser.
+    // https://connect.microsoft.com/VisualStudio/feedback/details/1106363/internal-compiler-error-with-analyze-due-to-failure-to-infer-move
+    posix::Abort();  // To make sure it is never run.
+    return false;
+#else
+    RefToConstProperty result = (obj.*property_)();
+    return MatchPrintAndExplain(result, matcher_, listener);
+#endif
+  }
+
+  bool MatchAndExplainImpl(true_type /* is_pointer */, const Class* p,
+                           MatchResultListener* listener) const {
+    if (p == NULL)
+      return false;
+
+    *listener << "which points to an object ";
+    // Since *p has a property method, it must be a class/struct/union
+    // type and thus cannot be a pointer. Therefore we pass
+    // false_type() as the first argument.
+    return MatchAndExplainImpl(false_type(), *p, listener);
+  }
+
+  PropertyType (Class::*property_)() const;
+  const Matcher<RefToConstProperty> matcher_;
+
+  GTEST_DISALLOW_ASSIGN_(PropertyMatcher);
+};
+
+// Type traits specifying various features of different functors for ResultOf.
+// The default template specifies features for functor objects.
+// Functor classes have to typedef argument_type and result_type
+// to be compatible with ResultOf.
+template <typename Functor>
+struct CallableTraits {
+  typedef typename Functor::result_type ResultType;
+  typedef Functor StorageType;
+
+  static void CheckIsValid(Functor /* functor */) {}
+  template <typename T>
+  static ResultType Invoke(Functor f, T arg) { return f(arg); }
+};
+
+// Specialization for function pointers.
+template <typename ArgType, typename ResType>
+struct CallableTraits<ResType(*)(ArgType)> {
+  typedef ResType ResultType;
+  typedef ResType(*StorageType)(ArgType);
+
+  static void CheckIsValid(ResType(*f)(ArgType)) {
+    GTEST_CHECK_(f != NULL)
+        << "NULL function pointer is passed into ResultOf().";
+  }
+  template <typename T>
+  static ResType Invoke(ResType(*f)(ArgType), T arg) {
+    return (*f)(arg);
+  }
+};
+
+// Implements the ResultOf() matcher for matching a return value of a
+// unary function of an object.
+template <typename Callable>
+class ResultOfMatcher {
+ public:
+  typedef typename CallableTraits<Callable>::ResultType ResultType;
+
+  ResultOfMatcher(Callable callable, const Matcher<ResultType>& matcher)
+      : callable_(callable), matcher_(matcher) {
+    CallableTraits<Callable>::CheckIsValid(callable_);
+  }
+
+  template <typename T>
+  operator Matcher<T>() const {
+    return Matcher<T>(new Impl<T>(callable_, matcher_));
+  }
+
+ private:
+  typedef typename CallableTraits<Callable>::StorageType CallableStorageType;
+
+  template <typename T>
+  class Impl : public MatcherInterface<T> {
+   public:
+    Impl(CallableStorageType callable, const Matcher<ResultType>& matcher)
+        : callable_(callable), matcher_(matcher) {}
+
+    virtual void DescribeTo(::std::ostream* os) const {
+      *os << "is mapped by the given callable to a value that ";
+      matcher_.DescribeTo(os);
+    }
+
+    virtual void DescribeNegationTo(::std::ostream* os) const {
+      *os << "is mapped by the given callable to a value that ";
+      matcher_.DescribeNegationTo(os);
+    }
+
+    virtual bool MatchAndExplain(T obj, MatchResultListener* listener) const {
+      *listener << "which is mapped by the given callable to ";
+      // Cannot pass the return value (for example, int) to
+      // MatchPrintAndExplain, which takes a non-const reference as argument.
+      ResultType result =
+          CallableTraits<Callable>::template Invoke<T>(callable_, obj);
+      return MatchPrintAndExplain(result, matcher_, listener);
+    }
+
+   private:
+    // Functors often define operator() as a non-const method even though
+    // they are actually stateless. But we need to use them even when
+    // 'this' is a const pointer. It's the user's responsibility not to
+    // use stateful callables with ResultOf(), which doesn't guarantee
+    // how many times the callable will be invoked.
+    mutable CallableStorageType callable_;
+    const Matcher<ResultType> matcher_;
+
+    GTEST_DISALLOW_ASSIGN_(Impl);
+  };  // class Impl
+
+  const CallableStorageType callable_;
+  const Matcher<ResultType> matcher_;
+
+  GTEST_DISALLOW_ASSIGN_(ResultOfMatcher);
+};
+
+// Implements a matcher that checks the size of an STL-style container.
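+//
+// Illustrative usage sketch (not part of the upstream header); v is a
+// hypothetical std::vector<int>:
+//
+//   EXPECT_THAT(v, SizeIs(3));       // v.size() == 3
+//   EXPECT_THAT(v, SizeIs(Gt(2u)));  // v.size() > 2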
+template <typename SizeMatcher> +class SizeIsMatcher { + public: + explicit SizeIsMatcher(const SizeMatcher& size_matcher) + : size_matcher_(size_matcher) { + } + + template <typename Container> + operator Matcher<Container>() const { + return MakeMatcher(new Impl<Container>(size_matcher_)); + } + + template <typename Container> + class Impl : public MatcherInterface<Container> { + public: + typedef internal::StlContainerView< + GTEST_REMOVE_REFERENCE_AND_CONST_(Container)> ContainerView; + typedef typename ContainerView::type::size_type SizeType; + explicit Impl(const SizeMatcher& size_matcher) + : size_matcher_(MatcherCast<SizeType>(size_matcher)) {} + + virtual void DescribeTo(::std::ostream* os) const { + *os << "size "; + size_matcher_.DescribeTo(os); + } + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "size "; + size_matcher_.DescribeNegationTo(os); + } + + virtual bool MatchAndExplain(Container container, + MatchResultListener* listener) const { + SizeType size = container.size(); + StringMatchResultListener size_listener; + const bool result = size_matcher_.MatchAndExplain(size, &size_listener); + *listener + << "whose size " << size << (result ? " matches" : " doesn't match"); + PrintIfNotEmpty(size_listener.str(), listener->stream()); + return result; + } + + private: + const Matcher<SizeType> size_matcher_; + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + private: + const SizeMatcher size_matcher_; + GTEST_DISALLOW_ASSIGN_(SizeIsMatcher); +}; + +// Implements a matcher that checks the begin()..end() distance of an STL-style +// container. +template <typename DistanceMatcher> +class BeginEndDistanceIsMatcher { + public: + explicit BeginEndDistanceIsMatcher(const DistanceMatcher& distance_matcher) + : distance_matcher_(distance_matcher) {} + + template <typename Container> + operator Matcher<Container>() const { + return MakeMatcher(new Impl<Container>(distance_matcher_)); + } + + template <typename Container> + class Impl : public MatcherInterface<Container> { + public: + typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer; + typedef internal::StlContainerView<RawContainer> View; + typedef typename View::type StlContainer; + typedef typename View::const_reference StlContainerReference; + typedef decltype(std::begin( + std::declval<StlContainerReference>())) StlContainerConstIterator; + typedef typename std::iterator_traits< + StlContainerConstIterator>::difference_type DistanceType; + explicit Impl(const DistanceMatcher& distance_matcher) + : distance_matcher_(MatcherCast<DistanceType>(distance_matcher)) {} + + virtual void DescribeTo(::std::ostream* os) const { + *os << "distance between begin() and end() "; + distance_matcher_.DescribeTo(os); + } + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "distance between begin() and end() "; + distance_matcher_.DescribeNegationTo(os); + } + + virtual bool MatchAndExplain(Container container, + MatchResultListener* listener) const { +#if GTEST_HAS_STD_BEGIN_AND_END_ + using std::begin; + using std::end; + DistanceType distance = std::distance(begin(container), end(container)); +#else + DistanceType distance = std::distance(container.begin(), container.end()); +#endif + StringMatchResultListener distance_listener; + const bool result = + distance_matcher_.MatchAndExplain(distance, &distance_listener); + *listener << "whose distance between begin() and end() " << distance + << (result ? 
" matches" : " doesn't match"); + PrintIfNotEmpty(distance_listener.str(), listener->stream()); + return result; + } + + private: + const Matcher<DistanceType> distance_matcher_; + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + private: + const DistanceMatcher distance_matcher_; + GTEST_DISALLOW_ASSIGN_(BeginEndDistanceIsMatcher); +}; + +// Implements an equality matcher for any STL-style container whose elements +// support ==. This matcher is like Eq(), but its failure explanations provide +// more detailed information that is useful when the container is used as a set. +// The failure message reports elements that are in one of the operands but not +// the other. The failure messages do not report duplicate or out-of-order +// elements in the containers (which don't properly matter to sets, but can +// occur if the containers are vectors or lists, for example). +// +// Uses the container's const_iterator, value_type, operator ==, +// begin(), and end(). +template <typename Container> +class ContainerEqMatcher { + public: + typedef internal::StlContainerView<Container> View; + typedef typename View::type StlContainer; + typedef typename View::const_reference StlContainerReference; + + // We make a copy of expected in case the elements in it are modified + // after this matcher is created. + explicit ContainerEqMatcher(const Container& expected) + : expected_(View::Copy(expected)) { + // Makes sure the user doesn't instantiate this class template + // with a const or reference type. + (void)testing::StaticAssertTypeEq<Container, + GTEST_REMOVE_REFERENCE_AND_CONST_(Container)>(); + } + + void DescribeTo(::std::ostream* os) const { + *os << "equals "; + UniversalPrint(expected_, os); + } + void DescribeNegationTo(::std::ostream* os) const { + *os << "does not equal "; + UniversalPrint(expected_, os); + } + + template <typename LhsContainer> + bool MatchAndExplain(const LhsContainer& lhs, + MatchResultListener* listener) const { + // GTEST_REMOVE_CONST_() is needed to work around an MSVC 8.0 bug + // that causes LhsContainer to be a const type sometimes. + typedef internal::StlContainerView<GTEST_REMOVE_CONST_(LhsContainer)> + LhsView; + typedef typename LhsView::type LhsStlContainer; + StlContainerReference lhs_stl_container = LhsView::ConstReference(lhs); + if (lhs_stl_container == expected_) + return true; + + ::std::ostream* const os = listener->stream(); + if (os != NULL) { + // Something is different. Check for extra values first. + bool printed_header = false; + for (typename LhsStlContainer::const_iterator it = + lhs_stl_container.begin(); + it != lhs_stl_container.end(); ++it) { + if (internal::ArrayAwareFind(expected_.begin(), expected_.end(), *it) == + expected_.end()) { + if (printed_header) { + *os << ", "; + } else { + *os << "which has these unexpected elements: "; + printed_header = true; + } + UniversalPrint(*it, os); + } + } + + // Now check for missing values. + bool printed_header2 = false; + for (typename StlContainer::const_iterator it = expected_.begin(); + it != expected_.end(); ++it) { + if (internal::ArrayAwareFind( + lhs_stl_container.begin(), lhs_stl_container.end(), *it) == + lhs_stl_container.end()) { + if (printed_header2) { + *os << ", "; + } else { + *os << (printed_header ? 
",\nand" : "which") + << " doesn't have these expected elements: "; + printed_header2 = true; + } + UniversalPrint(*it, os); + } + } + } + + return false; + } + + private: + const StlContainer expected_; + + GTEST_DISALLOW_ASSIGN_(ContainerEqMatcher); +}; + +// A comparator functor that uses the < operator to compare two values. +struct LessComparator { + template <typename T, typename U> + bool operator()(const T& lhs, const U& rhs) const { return lhs < rhs; } +}; + +// Implements WhenSortedBy(comparator, container_matcher). +template <typename Comparator, typename ContainerMatcher> +class WhenSortedByMatcher { + public: + WhenSortedByMatcher(const Comparator& comparator, + const ContainerMatcher& matcher) + : comparator_(comparator), matcher_(matcher) {} + + template <typename LhsContainer> + operator Matcher<LhsContainer>() const { + return MakeMatcher(new Impl<LhsContainer>(comparator_, matcher_)); + } + + template <typename LhsContainer> + class Impl : public MatcherInterface<LhsContainer> { + public: + typedef internal::StlContainerView< + GTEST_REMOVE_REFERENCE_AND_CONST_(LhsContainer)> LhsView; + typedef typename LhsView::type LhsStlContainer; + typedef typename LhsView::const_reference LhsStlContainerReference; + // Transforms std::pair<const Key, Value> into std::pair<Key, Value> + // so that we can match associative containers. + typedef typename RemoveConstFromKey< + typename LhsStlContainer::value_type>::type LhsValue; + + Impl(const Comparator& comparator, const ContainerMatcher& matcher) + : comparator_(comparator), matcher_(matcher) {} + + virtual void DescribeTo(::std::ostream* os) const { + *os << "(when sorted) "; + matcher_.DescribeTo(os); + } + + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "(when sorted) "; + matcher_.DescribeNegationTo(os); + } + + virtual bool MatchAndExplain(LhsContainer lhs, + MatchResultListener* listener) const { + LhsStlContainerReference lhs_stl_container = LhsView::ConstReference(lhs); + ::std::vector<LhsValue> sorted_container(lhs_stl_container.begin(), + lhs_stl_container.end()); + ::std::sort( + sorted_container.begin(), sorted_container.end(), comparator_); + + if (!listener->IsInterested()) { + // If the listener is not interested, we do not need to + // construct the inner explanation. + return matcher_.Matches(sorted_container); + } + + *listener << "which is "; + UniversalPrint(sorted_container, listener->stream()); + *listener << " when sorted"; + + StringMatchResultListener inner_listener; + const bool match = matcher_.MatchAndExplain(sorted_container, + &inner_listener); + PrintIfNotEmpty(inner_listener.str(), listener->stream()); + return match; + } + + private: + const Comparator comparator_; + const Matcher<const ::std::vector<LhsValue>&> matcher_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Impl); + }; + + private: + const Comparator comparator_; + const ContainerMatcher matcher_; + + GTEST_DISALLOW_ASSIGN_(WhenSortedByMatcher); +}; + +// Implements Pointwise(tuple_matcher, rhs_container). tuple_matcher +// must be able to be safely cast to Matcher<tuple<const T1&, const +// T2&> >, where T1 and T2 are the types of elements in the LHS +// container and the RHS container respectively. 
+template <typename TupleMatcher, typename RhsContainer> +class PointwiseMatcher { + public: + typedef internal::StlContainerView<RhsContainer> RhsView; + typedef typename RhsView::type RhsStlContainer; + typedef typename RhsStlContainer::value_type RhsValue; + + // Like ContainerEq, we make a copy of rhs in case the elements in + // it are modified after this matcher is created. + PointwiseMatcher(const TupleMatcher& tuple_matcher, const RhsContainer& rhs) + : tuple_matcher_(tuple_matcher), rhs_(RhsView::Copy(rhs)) { + // Makes sure the user doesn't instantiate this class template + // with a const or reference type. + (void)testing::StaticAssertTypeEq<RhsContainer, + GTEST_REMOVE_REFERENCE_AND_CONST_(RhsContainer)>(); + } + + template <typename LhsContainer> + operator Matcher<LhsContainer>() const { + return MakeMatcher(new Impl<LhsContainer>(tuple_matcher_, rhs_)); + } + + template <typename LhsContainer> + class Impl : public MatcherInterface<LhsContainer> { + public: + typedef internal::StlContainerView< + GTEST_REMOVE_REFERENCE_AND_CONST_(LhsContainer)> LhsView; + typedef typename LhsView::type LhsStlContainer; + typedef typename LhsView::const_reference LhsStlContainerReference; + typedef typename LhsStlContainer::value_type LhsValue; + // We pass the LHS value and the RHS value to the inner matcher by + // reference, as they may be expensive to copy. We must use tuple + // instead of pair here, as a pair cannot hold references (C++ 98, + // 20.2.2 [lib.pairs]). + typedef ::testing::tuple<const LhsValue&, const RhsValue&> InnerMatcherArg; + + Impl(const TupleMatcher& tuple_matcher, const RhsStlContainer& rhs) + // mono_tuple_matcher_ holds a monomorphic version of the tuple matcher. + : mono_tuple_matcher_(SafeMatcherCast<InnerMatcherArg>(tuple_matcher)), + rhs_(rhs) {} + + virtual void DescribeTo(::std::ostream* os) const { + *os << "contains " << rhs_.size() + << " values, where each value and its corresponding value in "; + UniversalPrinter<RhsStlContainer>::Print(rhs_, os); + *os << " "; + mono_tuple_matcher_.DescribeTo(os); + } + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "doesn't contain exactly " << rhs_.size() + << " values, or contains a value x at some index i" + << " where x and the i-th value of "; + UniversalPrint(rhs_, os); + *os << " "; + mono_tuple_matcher_.DescribeNegationTo(os); + } + + virtual bool MatchAndExplain(LhsContainer lhs, + MatchResultListener* listener) const { + LhsStlContainerReference lhs_stl_container = LhsView::ConstReference(lhs); + const size_t actual_size = lhs_stl_container.size(); + if (actual_size != rhs_.size()) { + *listener << "which contains " << actual_size << " values"; + return false; + } + + typename LhsStlContainer::const_iterator left = lhs_stl_container.begin(); + typename RhsStlContainer::const_iterator right = rhs_.begin(); + for (size_t i = 0; i != actual_size; ++i, ++left, ++right) { + const InnerMatcherArg value_pair(*left, *right); + + if (listener->IsInterested()) { + StringMatchResultListener inner_listener; + if (!mono_tuple_matcher_.MatchAndExplain( + value_pair, &inner_listener)) { + *listener << "where the value pair ("; + UniversalPrint(*left, listener->stream()); + *listener << ", "; + UniversalPrint(*right, listener->stream()); + *listener << ") at index #" << i << " don't match"; + PrintIfNotEmpty(inner_listener.str(), listener->stream()); + return false; + } + } else { + if (!mono_tuple_matcher_.Matches(value_pair)) + return false; + } + } + + return true; + } + + private: + const 
Matcher<InnerMatcherArg> mono_tuple_matcher_; + const RhsStlContainer rhs_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + private: + const TupleMatcher tuple_matcher_; + const RhsStlContainer rhs_; + + GTEST_DISALLOW_ASSIGN_(PointwiseMatcher); +}; + +// Holds the logic common to ContainsMatcherImpl and EachMatcherImpl. +template <typename Container> +class QuantifierMatcherImpl : public MatcherInterface<Container> { + public: + typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer; + typedef StlContainerView<RawContainer> View; + typedef typename View::type StlContainer; + typedef typename View::const_reference StlContainerReference; + typedef typename StlContainer::value_type Element; + + template <typename InnerMatcher> + explicit QuantifierMatcherImpl(InnerMatcher inner_matcher) + : inner_matcher_( + testing::SafeMatcherCast<const Element&>(inner_matcher)) {} + + // Checks whether: + // * All elements in the container match, if all_elements_should_match. + // * Any element in the container matches, if !all_elements_should_match. + bool MatchAndExplainImpl(bool all_elements_should_match, + Container container, + MatchResultListener* listener) const { + StlContainerReference stl_container = View::ConstReference(container); + size_t i = 0; + for (typename StlContainer::const_iterator it = stl_container.begin(); + it != stl_container.end(); ++it, ++i) { + StringMatchResultListener inner_listener; + const bool matches = inner_matcher_.MatchAndExplain(*it, &inner_listener); + + if (matches != all_elements_should_match) { + *listener << "whose element #" << i + << (matches ? " matches" : " doesn't match"); + PrintIfNotEmpty(inner_listener.str(), listener->stream()); + return !all_elements_should_match; + } + } + return all_elements_should_match; + } + + protected: + const Matcher<const Element&> inner_matcher_; + + GTEST_DISALLOW_ASSIGN_(QuantifierMatcherImpl); +}; + +// Implements Contains(element_matcher) for the given argument type Container. +// Symmetric to EachMatcherImpl. +template <typename Container> +class ContainsMatcherImpl : public QuantifierMatcherImpl<Container> { + public: + template <typename InnerMatcher> + explicit ContainsMatcherImpl(InnerMatcher inner_matcher) + : QuantifierMatcherImpl<Container>(inner_matcher) {} + + // Describes what this matcher does. + virtual void DescribeTo(::std::ostream* os) const { + *os << "contains at least one element that "; + this->inner_matcher_.DescribeTo(os); + } + + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "doesn't contain any element that "; + this->inner_matcher_.DescribeTo(os); + } + + virtual bool MatchAndExplain(Container container, + MatchResultListener* listener) const { + return this->MatchAndExplainImpl(false, container, listener); + } + + private: + GTEST_DISALLOW_ASSIGN_(ContainsMatcherImpl); +}; + +// Implements Each(element_matcher) for the given argument type Container. +// Symmetric to ContainsMatcherImpl. +template <typename Container> +class EachMatcherImpl : public QuantifierMatcherImpl<Container> { + public: + template <typename InnerMatcher> + explicit EachMatcherImpl(InnerMatcher inner_matcher) + : QuantifierMatcherImpl<Container>(inner_matcher) {} + + // Describes what this matcher does. 
+ virtual void DescribeTo(::std::ostream* os) const { + *os << "only contains elements that "; + this->inner_matcher_.DescribeTo(os); + } + + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "contains some element that "; + this->inner_matcher_.DescribeNegationTo(os); + } + + virtual bool MatchAndExplain(Container container, + MatchResultListener* listener) const { + return this->MatchAndExplainImpl(true, container, listener); + } + + private: + GTEST_DISALLOW_ASSIGN_(EachMatcherImpl); +}; + +// Implements polymorphic Contains(element_matcher). +template <typename M> +class ContainsMatcher { + public: + explicit ContainsMatcher(M m) : inner_matcher_(m) {} + + template <typename Container> + operator Matcher<Container>() const { + return MakeMatcher(new ContainsMatcherImpl<Container>(inner_matcher_)); + } + + private: + const M inner_matcher_; + + GTEST_DISALLOW_ASSIGN_(ContainsMatcher); +}; + +// Implements polymorphic Each(element_matcher). +template <typename M> +class EachMatcher { + public: + explicit EachMatcher(M m) : inner_matcher_(m) {} + + template <typename Container> + operator Matcher<Container>() const { + return MakeMatcher(new EachMatcherImpl<Container>(inner_matcher_)); + } + + private: + const M inner_matcher_; + + GTEST_DISALLOW_ASSIGN_(EachMatcher); +}; + +// Implements Key(inner_matcher) for the given argument pair type. +// Key(inner_matcher) matches an std::pair whose 'first' field matches +// inner_matcher. For example, Contains(Key(Ge(5))) can be used to match an +// std::map that contains at least one element whose key is >= 5. +template <typename PairType> +class KeyMatcherImpl : public MatcherInterface<PairType> { + public: + typedef GTEST_REMOVE_REFERENCE_AND_CONST_(PairType) RawPairType; + typedef typename RawPairType::first_type KeyType; + + template <typename InnerMatcher> + explicit KeyMatcherImpl(InnerMatcher inner_matcher) + : inner_matcher_( + testing::SafeMatcherCast<const KeyType&>(inner_matcher)) { + } + + // Returns true iff 'key_value.first' (the key) matches the inner matcher. + virtual bool MatchAndExplain(PairType key_value, + MatchResultListener* listener) const { + StringMatchResultListener inner_listener; + const bool match = inner_matcher_.MatchAndExplain(key_value.first, + &inner_listener); + const internal::string explanation = inner_listener.str(); + if (explanation != "") { + *listener << "whose first field is a value " << explanation; + } + return match; + } + + // Describes what this matcher does. + virtual void DescribeTo(::std::ostream* os) const { + *os << "has a key that "; + inner_matcher_.DescribeTo(os); + } + + // Describes what the negation of this matcher does. + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "doesn't have a key that "; + inner_matcher_.DescribeTo(os); + } + + private: + const Matcher<const KeyType&> inner_matcher_; + + GTEST_DISALLOW_ASSIGN_(KeyMatcherImpl); +}; + +// Implements polymorphic Key(matcher_for_key). +template <typename M> +class KeyMatcher { + public: + explicit KeyMatcher(M m) : matcher_for_key_(m) {} + + template <typename PairType> + operator Matcher<PairType>() const { + return MakeMatcher(new KeyMatcherImpl<PairType>(matcher_for_key_)); + } + + private: + const M matcher_for_key_; + + GTEST_DISALLOW_ASSIGN_(KeyMatcher); +}; + +// Implements Pair(first_matcher, second_matcher) for the given argument pair +// type with its two matchers. See Pair() function below. 
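+//
+// Illustrative usage sketch (not part of the upstream header); m is a
+// hypothetical std::map<int, std::string>:
+//
+//   EXPECT_THAT(m, Contains(Pair(1, "one")));
+//   EXPECT_THAT(m, Contains(Pair(Ge(2), _)));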
+template <typename PairType> +class PairMatcherImpl : public MatcherInterface<PairType> { + public: + typedef GTEST_REMOVE_REFERENCE_AND_CONST_(PairType) RawPairType; + typedef typename RawPairType::first_type FirstType; + typedef typename RawPairType::second_type SecondType; + + template <typename FirstMatcher, typename SecondMatcher> + PairMatcherImpl(FirstMatcher first_matcher, SecondMatcher second_matcher) + : first_matcher_( + testing::SafeMatcherCast<const FirstType&>(first_matcher)), + second_matcher_( + testing::SafeMatcherCast<const SecondType&>(second_matcher)) { + } + + // Describes what this matcher does. + virtual void DescribeTo(::std::ostream* os) const { + *os << "has a first field that "; + first_matcher_.DescribeTo(os); + *os << ", and has a second field that "; + second_matcher_.DescribeTo(os); + } + + // Describes what the negation of this matcher does. + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "has a first field that "; + first_matcher_.DescribeNegationTo(os); + *os << ", or has a second field that "; + second_matcher_.DescribeNegationTo(os); + } + + // Returns true iff 'a_pair.first' matches first_matcher and 'a_pair.second' + // matches second_matcher. + virtual bool MatchAndExplain(PairType a_pair, + MatchResultListener* listener) const { + if (!listener->IsInterested()) { + // If the listener is not interested, we don't need to construct the + // explanation. + return first_matcher_.Matches(a_pair.first) && + second_matcher_.Matches(a_pair.second); + } + StringMatchResultListener first_inner_listener; + if (!first_matcher_.MatchAndExplain(a_pair.first, + &first_inner_listener)) { + *listener << "whose first field does not match"; + PrintIfNotEmpty(first_inner_listener.str(), listener->stream()); + return false; + } + StringMatchResultListener second_inner_listener; + if (!second_matcher_.MatchAndExplain(a_pair.second, + &second_inner_listener)) { + *listener << "whose second field does not match"; + PrintIfNotEmpty(second_inner_listener.str(), listener->stream()); + return false; + } + ExplainSuccess(first_inner_listener.str(), second_inner_listener.str(), + listener); + return true; + } + + private: + void ExplainSuccess(const internal::string& first_explanation, + const internal::string& second_explanation, + MatchResultListener* listener) const { + *listener << "whose both fields match"; + if (first_explanation != "") { + *listener << ", where the first field is a value " << first_explanation; + } + if (second_explanation != "") { + *listener << ", "; + if (first_explanation != "") { + *listener << "and "; + } else { + *listener << "where "; + } + *listener << "the second field is a value " << second_explanation; + } + } + + const Matcher<const FirstType&> first_matcher_; + const Matcher<const SecondType&> second_matcher_; + + GTEST_DISALLOW_ASSIGN_(PairMatcherImpl); +}; + +// Implements polymorphic Pair(first_matcher, second_matcher). +template <typename FirstMatcher, typename SecondMatcher> +class PairMatcher { + public: + PairMatcher(FirstMatcher first_matcher, SecondMatcher second_matcher) + : first_matcher_(first_matcher), second_matcher_(second_matcher) {} + + template <typename PairType> + operator Matcher<PairType> () const { + return MakeMatcher( + new PairMatcherImpl<PairType>( + first_matcher_, second_matcher_)); + } + + private: + const FirstMatcher first_matcher_; + const SecondMatcher second_matcher_; + + GTEST_DISALLOW_ASSIGN_(PairMatcher); +}; + +// Implements ElementsAre() and ElementsAreArray(). 
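+//
+// Illustrative usage sketch (not part of the upstream header); v is a
+// hypothetical std::vector<int> holding {1, 2, 3}:
+//
+//   EXPECT_THAT(v, ElementsAre(1, Gt(0), _));  // order is significant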
+template <typename Container> +class ElementsAreMatcherImpl : public MatcherInterface<Container> { + public: + typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer; + typedef internal::StlContainerView<RawContainer> View; + typedef typename View::type StlContainer; + typedef typename View::const_reference StlContainerReference; + typedef decltype(std::begin( + std::declval<StlContainerReference>())) StlContainerConstIterator; + typedef typename std::remove_reference<decltype( + *std::declval<StlContainerConstIterator &>())>::type Element; + + // Constructs the matcher from a sequence of element values or + // element matchers. + template <typename InputIter> + ElementsAreMatcherImpl(InputIter first, InputIter last) { + while (first != last) { + matchers_.push_back(MatcherCast<const Element&>(*first++)); + } + } + + // Describes what this matcher does. + virtual void DescribeTo(::std::ostream* os) const { + if (count() == 0) { + *os << "is empty"; + } else if (count() == 1) { + *os << "has 1 element that "; + matchers_[0].DescribeTo(os); + } else { + *os << "has " << Elements(count()) << " where\n"; + for (size_t i = 0; i != count(); ++i) { + *os << "element #" << i << " "; + matchers_[i].DescribeTo(os); + if (i + 1 < count()) { + *os << ",\n"; + } + } + } + } + + // Describes what the negation of this matcher does. + virtual void DescribeNegationTo(::std::ostream* os) const { + if (count() == 0) { + *os << "isn't empty"; + return; + } + + *os << "doesn't have " << Elements(count()) << ", or\n"; + for (size_t i = 0; i != count(); ++i) { + *os << "element #" << i << " "; + matchers_[i].DescribeNegationTo(os); + if (i + 1 < count()) { + *os << ", or\n"; + } + } + } + + virtual bool MatchAndExplain(Container container, + MatchResultListener* listener) const { + // To work with stream-like "containers", we must only walk + // through the elements in one pass. + + const bool listener_interested = listener->IsInterested(); + + // explanations[i] is the explanation of the element at index i. + ::std::vector<internal::string> explanations(count()); + StlContainerReference stl_container = View::ConstReference(container); + StlContainerConstIterator it = stl_container.begin(); + size_t exam_pos = 0; + bool mismatch_found = false; // Have we found a mismatched element yet? + + // Go through the elements and matchers in pairs, until we reach + // the end of either the elements or the matchers, or until we find a + // mismatch. + for (; it != stl_container.end() && exam_pos != count(); ++it, ++exam_pos) { + bool match; // Does the current element match the current matcher? + if (listener_interested) { + StringMatchResultListener s; + match = matchers_[exam_pos].MatchAndExplain(*it, &s); + explanations[exam_pos] = s.str(); + } else { + match = matchers_[exam_pos].Matches(*it); + } + + if (!match) { + mismatch_found = true; + break; + } + } + // If mismatch_found is true, 'exam_pos' is the index of the mismatch. + + // Find how many elements the actual container has. We avoid + // calling size() s.t. this code works for stream-like "containers" + // that don't define size(). + size_t actual_count = exam_pos; + for (; it != stl_container.end(); ++it) { + ++actual_count; + } + + if (actual_count != count()) { + // The element count doesn't match. If the container is empty, + // there's no need to explain anything as Google Mock already + // prints the empty container. Otherwise we just need to show + // how many elements there actually are. 
+ if (listener_interested && (actual_count != 0)) { + *listener << "which has " << Elements(actual_count); + } + return false; + } + + if (mismatch_found) { + // The element count matches, but the exam_pos-th element doesn't match. + if (listener_interested) { + *listener << "whose element #" << exam_pos << " doesn't match"; + PrintIfNotEmpty(explanations[exam_pos], listener->stream()); + } + return false; + } + + // Every element matches its expectation. We need to explain why + // (the obvious ones can be skipped). + if (listener_interested) { + bool reason_printed = false; + for (size_t i = 0; i != count(); ++i) { + const internal::string& s = explanations[i]; + if (!s.empty()) { + if (reason_printed) { + *listener << ",\nand "; + } + *listener << "whose element #" << i << " matches, " << s; + reason_printed = true; + } + } + } + return true; + } + + private: + static Message Elements(size_t count) { + return Message() << count << (count == 1 ? " element" : " elements"); + } + + size_t count() const { return matchers_.size(); } + + ::std::vector<Matcher<const Element&> > matchers_; + + GTEST_DISALLOW_ASSIGN_(ElementsAreMatcherImpl); +}; + +// Connectivity matrix of (elements X matchers), in element-major order. +// Initially, there are no edges. +// Use NextGraph() to iterate over all possible edge configurations. +// Use Randomize() to generate a random edge configuration. +class GTEST_API_ MatchMatrix { + public: + MatchMatrix(size_t num_elements, size_t num_matchers) + : num_elements_(num_elements), + num_matchers_(num_matchers), + matched_(num_elements_* num_matchers_, 0) { + } + + size_t LhsSize() const { return num_elements_; } + size_t RhsSize() const { return num_matchers_; } + bool HasEdge(size_t ilhs, size_t irhs) const { + return matched_[SpaceIndex(ilhs, irhs)] == 1; + } + void SetEdge(size_t ilhs, size_t irhs, bool b) { + matched_[SpaceIndex(ilhs, irhs)] = b ? 1 : 0; + } + + // Treating the connectivity matrix as a (LhsSize()*RhsSize())-bit number, + // adds 1 to that number; returns false if incrementing the graph left it + // empty. + bool NextGraph(); + + void Randomize(); + + string DebugString() const; + + private: + size_t SpaceIndex(size_t ilhs, size_t irhs) const { + return ilhs * num_matchers_ + irhs; + } + + size_t num_elements_; + size_t num_matchers_; + + // Each element is a char interpreted as bool. They are stored as a + // flattened array in lhs-major order, use 'SpaceIndex()' to translate + // a (ilhs, irhs) matrix coordinate into an offset. + ::std::vector<char> matched_; +}; + +typedef ::std::pair<size_t, size_t> ElementMatcherPair; +typedef ::std::vector<ElementMatcherPair> ElementMatcherPairs; + +// Returns a maximum bipartite matching for the specified graph 'g'. +// The matching is represented as a vector of {element, matcher} pairs. +GTEST_API_ ElementMatcherPairs +FindMaxBipartiteMatching(const MatchMatrix& g); + +GTEST_API_ bool FindPairing(const MatchMatrix& matrix, + MatchResultListener* listener); + +// Untyped base class for implementing UnorderedElementsAre. By +// putting logic that's not specific to the element type here, we +// reduce binary bloat and increase compilation speed. +class GTEST_API_ UnorderedElementsAreMatcherImplBase { + protected: + // A vector of matcher describers, one for each element matcher. + // Does not own the describers (and thus can be used only when the + // element matchers are alive). 
+ typedef ::std::vector<const MatcherDescriberInterface*> MatcherDescriberVec; + + // Describes this UnorderedElementsAre matcher. + void DescribeToImpl(::std::ostream* os) const; + + // Describes the negation of this UnorderedElementsAre matcher. + void DescribeNegationToImpl(::std::ostream* os) const; + + bool VerifyAllElementsAndMatchersAreMatched( + const ::std::vector<string>& element_printouts, + const MatchMatrix& matrix, + MatchResultListener* listener) const; + + MatcherDescriberVec& matcher_describers() { + return matcher_describers_; + } + + static Message Elements(size_t n) { + return Message() << n << " element" << (n == 1 ? "" : "s"); + } + + private: + MatcherDescriberVec matcher_describers_; + + GTEST_DISALLOW_ASSIGN_(UnorderedElementsAreMatcherImplBase); +}; + +// Implements unordered ElementsAre and unordered ElementsAreArray. +template <typename Container> +class UnorderedElementsAreMatcherImpl + : public MatcherInterface<Container>, + public UnorderedElementsAreMatcherImplBase { + public: + typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer; + typedef internal::StlContainerView<RawContainer> View; + typedef typename View::type StlContainer; + typedef typename View::const_reference StlContainerReference; + typedef decltype(std::begin( + std::declval<StlContainerReference>())) StlContainerConstIterator; + typedef typename std::remove_reference<decltype( + *std::declval<StlContainerConstIterator &>())>::type Element; + + // Constructs the matcher from a sequence of element values or + // element matchers. + template <typename InputIter> + UnorderedElementsAreMatcherImpl(InputIter first, InputIter last) { + for (; first != last; ++first) { + matchers_.push_back(MatcherCast<const Element&>(*first)); + matcher_describers().push_back(matchers_.back().GetDescriber()); + } + } + + // Describes what this matcher does. + virtual void DescribeTo(::std::ostream* os) const { + return UnorderedElementsAreMatcherImplBase::DescribeToImpl(os); + } + + // Describes what the negation of this matcher does. + virtual void DescribeNegationTo(::std::ostream* os) const { + return UnorderedElementsAreMatcherImplBase::DescribeNegationToImpl(os); + } + + virtual bool MatchAndExplain(Container container, + MatchResultListener* listener) const { + StlContainerReference stl_container = View::ConstReference(container); + ::std::vector<string> element_printouts; + MatchMatrix matrix = AnalyzeElements(stl_container.begin(), + stl_container.end(), + &element_printouts, + listener); + + const size_t actual_count = matrix.LhsSize(); + if (actual_count == 0 && matchers_.empty()) { + return true; + } + if (actual_count != matchers_.size()) { + // The element count doesn't match. If the container is empty, + // there's no need to explain anything as Google Mock already + // prints the empty container. Otherwise we just need to show + // how many elements there actually are. 
+ if (actual_count != 0 && listener->IsInterested()) { + *listener << "which has " << Elements(actual_count); + } + return false; + } + + return VerifyAllElementsAndMatchersAreMatched(element_printouts, + matrix, listener) && + FindPairing(matrix, listener); + } + + private: + typedef ::std::vector<Matcher<const Element&> > MatcherVec; + + template <typename ElementIter> + MatchMatrix AnalyzeElements(ElementIter elem_first, ElementIter elem_last, + ::std::vector<string>* element_printouts, + MatchResultListener* listener) const { + element_printouts->clear(); + ::std::vector<char> did_match; + size_t num_elements = 0; + for (; elem_first != elem_last; ++num_elements, ++elem_first) { + if (listener->IsInterested()) { + element_printouts->push_back(PrintToString(*elem_first)); + } + for (size_t irhs = 0; irhs != matchers_.size(); ++irhs) { + did_match.push_back(Matches(matchers_[irhs])(*elem_first)); + } + } + + MatchMatrix matrix(num_elements, matchers_.size()); + ::std::vector<char>::const_iterator did_match_iter = did_match.begin(); + for (size_t ilhs = 0; ilhs != num_elements; ++ilhs) { + for (size_t irhs = 0; irhs != matchers_.size(); ++irhs) { + matrix.SetEdge(ilhs, irhs, *did_match_iter++ != 0); + } + } + return matrix; + } + + MatcherVec matchers_; + + GTEST_DISALLOW_ASSIGN_(UnorderedElementsAreMatcherImpl); +}; + +// Functor for use in TransformTuple. +// Performs MatcherCast<Target> on an input argument of any type. +template <typename Target> +struct CastAndAppendTransform { + template <typename Arg> + Matcher<Target> operator()(const Arg& a) const { + return MatcherCast<Target>(a); + } +}; + +// Implements UnorderedElementsAre. +template <typename MatcherTuple> +class UnorderedElementsAreMatcher { + public: + explicit UnorderedElementsAreMatcher(const MatcherTuple& args) + : matchers_(args) {} + + template <typename Container> + operator Matcher<Container>() const { + typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer; + typedef internal::StlContainerView<RawContainer> View; + typedef typename View::const_reference StlContainerReference; + typedef decltype(std::begin( + std::declval<StlContainerReference>())) StlContainerConstIterator; + typedef typename std::remove_reference<decltype( + *std::declval<StlContainerConstIterator &>())>::type Element; + typedef ::std::vector<Matcher<const Element&> > MatcherVec; + MatcherVec matchers; + matchers.reserve(::testing::tuple_size<MatcherTuple>::value); + TransformTupleValues(CastAndAppendTransform<const Element&>(), matchers_, + ::std::back_inserter(matchers)); + return MakeMatcher(new UnorderedElementsAreMatcherImpl<Container>( + matchers.begin(), matchers.end())); + } + + private: + const MatcherTuple matchers_; + GTEST_DISALLOW_ASSIGN_(UnorderedElementsAreMatcher); +}; + +// Implements ElementsAre. 
+template <typename MatcherTuple>
+class ElementsAreMatcher {
+ public:
+  explicit ElementsAreMatcher(const MatcherTuple& args) : matchers_(args) {}
+
+  template <typename Container>
+  operator Matcher<Container>() const {
+    typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer;
+    typedef internal::StlContainerView<RawContainer> View;
+    typedef typename View::const_reference StlContainerReference;
+    typedef decltype(std::begin(
+        std::declval<StlContainerReference>())) StlContainerConstIterator;
+    typedef typename std::remove_reference<decltype(
+        *std::declval<StlContainerConstIterator &>())>::type Element;
+    typedef ::std::vector<Matcher<const Element&> > MatcherVec;
+    MatcherVec matchers;
+    matchers.reserve(::testing::tuple_size<MatcherTuple>::value);
+    TransformTupleValues(CastAndAppendTransform<const Element&>(), matchers_,
+                         ::std::back_inserter(matchers));
+    return MakeMatcher(new ElementsAreMatcherImpl<Container>(
+        matchers.begin(), matchers.end()));
+  }
+
+ private:
+  const MatcherTuple matchers_;
+  GTEST_DISALLOW_ASSIGN_(ElementsAreMatcher);
+};
+
+// Implements UnorderedElementsAreArray().
+template <typename T>
+class UnorderedElementsAreArrayMatcher {
+ public:
+  UnorderedElementsAreArrayMatcher() {}
+
+  template <typename Iter>
+  UnorderedElementsAreArrayMatcher(Iter first, Iter last)
+      : matchers_(first, last) {}
+
+  template <typename Container>
+  operator Matcher<Container>() const {
+    return MakeMatcher(
+        new UnorderedElementsAreMatcherImpl<Container>(matchers_.begin(),
+                                                       matchers_.end()));
+  }
+
+ private:
+  ::std::vector<T> matchers_;
+
+  GTEST_DISALLOW_ASSIGN_(UnorderedElementsAreArrayMatcher);
+};
+
+// Implements ElementsAreArray().
+template <typename T>
+class ElementsAreArrayMatcher {
+ public:
+  template <typename Iter>
+  ElementsAreArrayMatcher(Iter first, Iter last) : matchers_(first, last) {}
+
+  template <typename Container>
+  operator Matcher<Container>() const {
+    return MakeMatcher(new ElementsAreMatcherImpl<Container>(
+        matchers_.begin(), matchers_.end()));
+  }
+
+ private:
+  const ::std::vector<T> matchers_;
+
+  GTEST_DISALLOW_ASSIGN_(ElementsAreArrayMatcher);
+};
+
+// Given a 2-tuple matcher tm of type Tuple2Matcher and a value second
+// of type Second, BoundSecondMatcher<Tuple2Matcher, Second>(tm,
+// second) is a polymorphic matcher that matches a value x iff tm
+// matches tuple (x, second). Useful for implementing
+// UnorderedPointwise() in terms of UnorderedElementsAreArray().
+//
+// BoundSecondMatcher is copyable and assignable, as we need to put
+// instances of this class in a vector when implementing
+// UnorderedPointwise().
+template <typename Tuple2Matcher, typename Second>
+class BoundSecondMatcher {
+ public:
+  BoundSecondMatcher(const Tuple2Matcher& tm, const Second& second)
+      : tuple2_matcher_(tm), second_value_(second) {}
+
+  template <typename T>
+  operator Matcher<T>() const {
+    return MakeMatcher(new Impl<T>(tuple2_matcher_, second_value_));
+  }
+
+  // We have to define this for UnorderedPointwise() to compile in
+  // C++98 mode, as it puts BoundSecondMatcher instances in a vector,
+  // which requires the elements to be assignable in C++98. The
+  // compiler cannot generate the operator= for us, as Tuple2Matcher
+  // and Second may not be assignable.
+  //
+  // However, this should never be called, so the implementation just
+  // needs to assert.
+ void operator=(const BoundSecondMatcher& /*rhs*/) { + GTEST_LOG_(FATAL) << "BoundSecondMatcher should never be assigned."; + } + + private: + template <typename T> + class Impl : public MatcherInterface<T> { + public: + typedef ::testing::tuple<T, Second> ArgTuple; + + Impl(const Tuple2Matcher& tm, const Second& second) + : mono_tuple2_matcher_(SafeMatcherCast<const ArgTuple&>(tm)), + second_value_(second) {} + + virtual void DescribeTo(::std::ostream* os) const { + *os << "and "; + UniversalPrint(second_value_, os); + *os << " "; + mono_tuple2_matcher_.DescribeTo(os); + } + + virtual bool MatchAndExplain(T x, MatchResultListener* listener) const { + return mono_tuple2_matcher_.MatchAndExplain(ArgTuple(x, second_value_), + listener); + } + + private: + const Matcher<const ArgTuple&> mono_tuple2_matcher_; + const Second second_value_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + const Tuple2Matcher tuple2_matcher_; + const Second second_value_; +}; + +// Given a 2-tuple matcher tm and a value second, +// MatcherBindSecond(tm, second) returns a matcher that matches a +// value x iff tm matches tuple (x, second). Useful for implementing +// UnorderedPointwise() in terms of UnorderedElementsAreArray(). +template <typename Tuple2Matcher, typename Second> +BoundSecondMatcher<Tuple2Matcher, Second> MatcherBindSecond( + const Tuple2Matcher& tm, const Second& second) { + return BoundSecondMatcher<Tuple2Matcher, Second>(tm, second); +} + +// Returns the description for a matcher defined using the MATCHER*() +// macro where the user-supplied description string is "", if +// 'negation' is false; otherwise returns the description of the +// negation of the matcher. 'param_values' contains a list of strings +// that are the print-out of the matcher's parameters. +GTEST_API_ string FormatMatcherDescription(bool negation, + const char* matcher_name, + const Strings& param_values); + +} // namespace internal + +// ElementsAreArray(first, last) +// ElementsAreArray(pointer, count) +// ElementsAreArray(array) +// ElementsAreArray(container) +// ElementsAreArray({ e1, e2, ..., en }) +// +// The ElementsAreArray() functions are like ElementsAre(...), except +// that they are given a homogeneous sequence rather than taking each +// element as a function argument. The sequence can be specified as an +// array, a pointer and count, a vector, an initializer list, or an +// STL iterator range. In each of these cases, the underlying sequence +// can be either a sequence of values or a sequence of matchers. +// +// All forms of ElementsAreArray() make a copy of the input matcher sequence. 
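+//
+// Illustrative usage sketch (not part of the upstream header); v is a
+// hypothetical std::vector<int>:
+//
+//   const int expected[] = {1, 2, 3};
+//   EXPECT_THAT(v, ElementsAreArray(expected));     // from a native array
+//   EXPECT_THAT(v, ElementsAreArray(expected, 3));  // pointer + count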
+ +template <typename Iter> +inline internal::ElementsAreArrayMatcher< + typename ::std::iterator_traits<Iter>::value_type> +ElementsAreArray(Iter first, Iter last) { + typedef typename ::std::iterator_traits<Iter>::value_type T; + return internal::ElementsAreArrayMatcher<T>(first, last); +} + +template <typename T> +inline internal::ElementsAreArrayMatcher<T> ElementsAreArray( + const T* pointer, size_t count) { + return ElementsAreArray(pointer, pointer + count); +} + +template <typename T, size_t N> +inline internal::ElementsAreArrayMatcher<T> ElementsAreArray( + const T (&array)[N]) { + return ElementsAreArray(array, N); +} + +template <typename Container> +inline internal::ElementsAreArrayMatcher<typename Container::value_type> +ElementsAreArray(const Container& container) { + return ElementsAreArray(container.begin(), container.end()); +} + +#if GTEST_HAS_STD_INITIALIZER_LIST_ +template <typename T> +inline internal::ElementsAreArrayMatcher<T> +ElementsAreArray(::std::initializer_list<T> xs) { + return ElementsAreArray(xs.begin(), xs.end()); +} +#endif + +// UnorderedElementsAreArray(first, last) +// UnorderedElementsAreArray(pointer, count) +// UnorderedElementsAreArray(array) +// UnorderedElementsAreArray(container) +// UnorderedElementsAreArray({ e1, e2, ..., en }) +// +// The UnorderedElementsAreArray() functions are like +// ElementsAreArray(...), but allow matching the elements in any order. +template <typename Iter> +inline internal::UnorderedElementsAreArrayMatcher< + typename ::std::iterator_traits<Iter>::value_type> +UnorderedElementsAreArray(Iter first, Iter last) { + typedef typename ::std::iterator_traits<Iter>::value_type T; + return internal::UnorderedElementsAreArrayMatcher<T>(first, last); +} + +template <typename T> +inline internal::UnorderedElementsAreArrayMatcher<T> +UnorderedElementsAreArray(const T* pointer, size_t count) { + return UnorderedElementsAreArray(pointer, pointer + count); +} + +template <typename T, size_t N> +inline internal::UnorderedElementsAreArrayMatcher<T> +UnorderedElementsAreArray(const T (&array)[N]) { + return UnorderedElementsAreArray(array, N); +} + +template <typename Container> +inline internal::UnorderedElementsAreArrayMatcher< + typename Container::value_type> +UnorderedElementsAreArray(const Container& container) { + return UnorderedElementsAreArray(container.begin(), container.end()); +} + +#if GTEST_HAS_STD_INITIALIZER_LIST_ +template <typename T> +inline internal::UnorderedElementsAreArrayMatcher<T> +UnorderedElementsAreArray(::std::initializer_list<T> xs) { + return UnorderedElementsAreArray(xs.begin(), xs.end()); +} +#endif + +// _ is a matcher that matches anything of any type. +// +// This definition is fine as: +// +// 1. The C++ standard permits using the name _ in a namespace that +// is not the global namespace or ::std. +// 2. The AnythingMatcher class has no data member or constructor, +// so it's OK to create global variables of this type. +// 3. c-style has approved of using _ in this case. +const internal::AnythingMatcher _ = {}; +// Creates a matcher that matches any value of the given type T. +template <typename T> +inline Matcher<T> A() { return MakeMatcher(new internal::AnyMatcherImpl<T>()); } + +// Creates a matcher that matches any value of the given type T. +template <typename T> +inline Matcher<T> An() { return A<T>(); } + +// Creates a polymorphic matcher that matches anything equal to x. +// Note: if the parameter of Eq() were declared as const T&, Eq("foo") +// wouldn't compile. 
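+//
+// A sketch of typical use (the values are illustrative):
+//
+//   EXPECT_THAT(5, Eq(5));         // Polymorphic form.
+//   Matcher<int> m = Eq(7);        // Monomorphic Matcher<int>.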
+template <typename T>
+inline internal::EqMatcher<T> Eq(T x) { return internal::EqMatcher<T>(x); }
+
+// Constructs a Matcher<T> from a 'value' of type T. The constructed
+// matcher matches any value that's equal to 'value'.
+template <typename T>
+Matcher<T>::Matcher(T value) { *this = Eq(value); }
+
+// Creates a monomorphic matcher that matches anything with type Lhs
+// and equal to rhs. A user may need to use this instead of Eq(...)
+// in order to resolve an overloading ambiguity.
+//
+// TypedEq<T>(x) is just a convenient short-hand for Matcher<T>(Eq(x))
+// or Matcher<T>(x), but more readable than the latter.
+//
+// We could define similar monomorphic matchers for other comparison
+// operations (e.g. TypedLt, TypedGe, etc.), but decided not to do
+// it yet as those are used much less than Eq() in practice. A user
+// can always write Matcher<T>(Lt(5)) to be explicit about the type,
+// for example.
+template <typename Lhs, typename Rhs>
+inline Matcher<Lhs> TypedEq(const Rhs& rhs) { return Eq(rhs); }
+
+// Creates a polymorphic matcher that matches anything >= x.
+template <typename Rhs>
+inline internal::GeMatcher<Rhs> Ge(Rhs x) {
+  return internal::GeMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything > x.
+template <typename Rhs>
+inline internal::GtMatcher<Rhs> Gt(Rhs x) {
+  return internal::GtMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything <= x.
+template <typename Rhs>
+inline internal::LeMatcher<Rhs> Le(Rhs x) {
+  return internal::LeMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything < x.
+template <typename Rhs>
+inline internal::LtMatcher<Rhs> Lt(Rhs x) {
+  return internal::LtMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything != x.
+template <typename Rhs>
+inline internal::NeMatcher<Rhs> Ne(Rhs x) {
+  return internal::NeMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches any NULL pointer.
+inline PolymorphicMatcher<internal::IsNullMatcher > IsNull() {
+  return MakePolymorphicMatcher(internal::IsNullMatcher());
+}
+
+// Creates a polymorphic matcher that matches any non-NULL pointer.
+// This is convenient as Not(NULL) doesn't compile (the compiler
+// thinks that the expression is comparing a pointer with an integer).
+inline PolymorphicMatcher<internal::NotNullMatcher > NotNull() {
+  return MakePolymorphicMatcher(internal::NotNullMatcher());
+}
+
+// Creates a polymorphic matcher that matches any argument that
+// references variable x.
+template <typename T>
+inline internal::RefMatcher<T&> Ref(T& x) {  // NOLINT
+  return internal::RefMatcher<T&>(x);
+}
+
+// Creates a matcher that matches any double argument approximately
+// equal to rhs, where two NANs are considered unequal.
+inline internal::FloatingEqMatcher<double> DoubleEq(double rhs) {
+  return internal::FloatingEqMatcher<double>(rhs, false);
+}
+
+// Creates a matcher that matches any double argument approximately
+// equal to rhs, including NaN values when rhs is NaN.
+inline internal::FloatingEqMatcher<double> NanSensitiveDoubleEq(double rhs) {
+  return internal::FloatingEqMatcher<double>(rhs, true);
+}
+
+// Creates a matcher that matches any double argument approximately equal to
+// rhs, up to the specified max absolute error bound, where two NANs are
+// considered unequal. The max absolute error bound must be non-negative.
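+//
+// For illustration (the tolerance below is arbitrary):
+//
+//   EXPECT_THAT(0.1 + 0.2, DoubleNear(0.3, 1e-9));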
+inline internal::FloatingEqMatcher<double> DoubleNear( + double rhs, double max_abs_error) { + return internal::FloatingEqMatcher<double>(rhs, false, max_abs_error); +} + +// Creates a matcher that matches any double argument approximately equal to +// rhs, up to the specified max absolute error bound, including NaN values when +// rhs is NaN. The max absolute error bound must be non-negative. +inline internal::FloatingEqMatcher<double> NanSensitiveDoubleNear( + double rhs, double max_abs_error) { + return internal::FloatingEqMatcher<double>(rhs, true, max_abs_error); +} + +// Creates a matcher that matches any float argument approximately +// equal to rhs, where two NANs are considered unequal. +inline internal::FloatingEqMatcher<float> FloatEq(float rhs) { + return internal::FloatingEqMatcher<float>(rhs, false); +} + +// Creates a matcher that matches any float argument approximately +// equal to rhs, including NaN values when rhs is NaN. +inline internal::FloatingEqMatcher<float> NanSensitiveFloatEq(float rhs) { + return internal::FloatingEqMatcher<float>(rhs, true); +} + +// Creates a matcher that matches any float argument approximately equal to +// rhs, up to the specified max absolute error bound, where two NANs are +// considered unequal. The max absolute error bound must be non-negative. +inline internal::FloatingEqMatcher<float> FloatNear( + float rhs, float max_abs_error) { + return internal::FloatingEqMatcher<float>(rhs, false, max_abs_error); +} + +// Creates a matcher that matches any float argument approximately equal to +// rhs, up to the specified max absolute error bound, including NaN values when +// rhs is NaN. The max absolute error bound must be non-negative. +inline internal::FloatingEqMatcher<float> NanSensitiveFloatNear( + float rhs, float max_abs_error) { + return internal::FloatingEqMatcher<float>(rhs, true, max_abs_error); +} + +// Creates a matcher that matches a pointer (raw or smart) that points +// to a value that matches inner_matcher. +template <typename InnerMatcher> +inline internal::PointeeMatcher<InnerMatcher> Pointee( + const InnerMatcher& inner_matcher) { + return internal::PointeeMatcher<InnerMatcher>(inner_matcher); +} + +// Creates a matcher that matches a pointer or reference that matches +// inner_matcher when dynamic_cast<To> is applied. +// The result of dynamic_cast<To> is forwarded to the inner matcher. +// If To is a pointer and the cast fails, the inner matcher will receive NULL. +// If To is a reference and the cast fails, this matcher returns false +// immediately. +template <typename To> +inline PolymorphicMatcher<internal::WhenDynamicCastToMatcher<To> > +WhenDynamicCastTo(const Matcher<To>& inner_matcher) { + return MakePolymorphicMatcher( + internal::WhenDynamicCastToMatcher<To>(inner_matcher)); +} + +// Creates a matcher that matches an object whose given field matches +// 'matcher'. For example, +// Field(&Foo::number, Ge(5)) +// matches a Foo object x iff x.number >= 5. +template <typename Class, typename FieldType, typename FieldMatcher> +inline PolymorphicMatcher< + internal::FieldMatcher<Class, FieldType> > Field( + FieldType Class::*field, const FieldMatcher& matcher) { + return MakePolymorphicMatcher( + internal::FieldMatcher<Class, FieldType>( + field, MatcherCast<const FieldType&>(matcher))); + // The call to MatcherCast() is required for supporting inner + // matchers of compatible types. For example, it allows + // Field(&Foo::bar, m) + // to compile where bar is an int32 and m is a matcher for int64. 
+}
+
+// Creates a matcher that matches an object whose given property
+// matches 'matcher'. For example,
+//   Property(&Foo::str, StartsWith("hi"))
+// matches a Foo object x iff x.str() starts with "hi".
+template <typename Class, typename PropertyType, typename PropertyMatcher>
+inline PolymorphicMatcher<
+  internal::PropertyMatcher<Class, PropertyType> > Property(
+    PropertyType (Class::*property)() const, const PropertyMatcher& matcher) {
+  return MakePolymorphicMatcher(
+      internal::PropertyMatcher<Class, PropertyType>(
+          property,
+          MatcherCast<GTEST_REFERENCE_TO_CONST_(PropertyType)>(matcher)));
+  // The call to MatcherCast() is required for supporting inner
+  // matchers of compatible types. For example, it allows
+  //   Property(&Foo::bar, m)
+  // to compile where bar() returns an int32 and m is a matcher for int64.
+}
+
+// Creates a matcher that matches an object iff the result of applying
+// a callable to x matches 'matcher'.
+// For example,
+//   ResultOf(f, StartsWith("hi"))
+// matches a Foo object x iff f(x) starts with "hi".
+// The callable parameter can be a function, a function pointer, or a
+// functor. Callable has to satisfy the following conditions:
+//   * It must keep no state that affects the results of calls on it,
+//     and it must make no assumptions about how many calls will be
+//     made. Any state it keeps must be protected from concurrent
+//     access.
+//   * If it is a function object, it has to define type result_type.
+//     We recommend deriving your functor classes from std::unary_function.
+template <typename Callable, typename ResultOfMatcher>
+internal::ResultOfMatcher<Callable> ResultOf(
+    Callable callable, const ResultOfMatcher& matcher) {
+  return internal::ResultOfMatcher<Callable>(
+      callable,
+      MatcherCast<typename internal::CallableTraits<Callable>::ResultType>(
+          matcher));
+  // The call to MatcherCast() is required for supporting inner
+  // matchers of compatible types. For example, it allows
+  //   ResultOf(Function, m)
+  // to compile where Function() returns an int32 and m is a matcher for int64.
+}
+
+// String matchers.
+
+// Matches a string equal to str.
+inline PolymorphicMatcher<internal::StrEqualityMatcher<internal::string> >
+    StrEq(const internal::string& str) {
+  return MakePolymorphicMatcher(internal::StrEqualityMatcher<internal::string>(
+      str, true, true));
+}
+
+// Matches a string not equal to str.
+inline PolymorphicMatcher<internal::StrEqualityMatcher<internal::string> >
+    StrNe(const internal::string& str) {
+  return MakePolymorphicMatcher(internal::StrEqualityMatcher<internal::string>(
+      str, false, true));
+}
+
+// Matches a string equal to str, ignoring case.
+inline PolymorphicMatcher<internal::StrEqualityMatcher<internal::string> >
+    StrCaseEq(const internal::string& str) {
+  return MakePolymorphicMatcher(internal::StrEqualityMatcher<internal::string>(
+      str, true, false));
+}
+
+// Matches a string not equal to str, ignoring case.
+inline PolymorphicMatcher<internal::StrEqualityMatcher<internal::string> >
+    StrCaseNe(const internal::string& str) {
+  return MakePolymorphicMatcher(internal::StrEqualityMatcher<internal::string>(
+      str, false, false));
+}
+
+// Creates a matcher that matches any string, std::string, or C string
+// that contains the given substring.
+inline PolymorphicMatcher<internal::HasSubstrMatcher<internal::string> > + HasSubstr(const internal::string& substring) { + return MakePolymorphicMatcher(internal::HasSubstrMatcher<internal::string>( + substring)); +} + +// Matches a string that starts with 'prefix' (case-sensitive). +inline PolymorphicMatcher<internal::StartsWithMatcher<internal::string> > + StartsWith(const internal::string& prefix) { + return MakePolymorphicMatcher(internal::StartsWithMatcher<internal::string>( + prefix)); +} + +// Matches a string that ends with 'suffix' (case-sensitive). +inline PolymorphicMatcher<internal::EndsWithMatcher<internal::string> > + EndsWith(const internal::string& suffix) { + return MakePolymorphicMatcher(internal::EndsWithMatcher<internal::string>( + suffix)); +} + +// Matches a string that fully matches regular expression 'regex'. +// The matcher takes ownership of 'regex'. +inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex( + const internal::RE* regex) { + return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true)); +} +inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex( + const internal::string& regex) { + return MatchesRegex(new internal::RE(regex)); +} + +// Matches a string that contains regular expression 'regex'. +// The matcher takes ownership of 'regex'. +inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex( + const internal::RE* regex) { + return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false)); +} +inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex( + const internal::string& regex) { + return ContainsRegex(new internal::RE(regex)); +} + +#if GTEST_HAS_GLOBAL_WSTRING || GTEST_HAS_STD_WSTRING +// Wide string matchers. + +// Matches a string equal to str. +inline PolymorphicMatcher<internal::StrEqualityMatcher<internal::wstring> > + StrEq(const internal::wstring& str) { + return MakePolymorphicMatcher(internal::StrEqualityMatcher<internal::wstring>( + str, true, true)); +} + +// Matches a string not equal to str. +inline PolymorphicMatcher<internal::StrEqualityMatcher<internal::wstring> > + StrNe(const internal::wstring& str) { + return MakePolymorphicMatcher(internal::StrEqualityMatcher<internal::wstring>( + str, false, true)); +} + +// Matches a string equal to str, ignoring case. +inline PolymorphicMatcher<internal::StrEqualityMatcher<internal::wstring> > + StrCaseEq(const internal::wstring& str) { + return MakePolymorphicMatcher(internal::StrEqualityMatcher<internal::wstring>( + str, true, false)); +} + +// Matches a string not equal to str, ignoring case. +inline PolymorphicMatcher<internal::StrEqualityMatcher<internal::wstring> > + StrCaseNe(const internal::wstring& str) { + return MakePolymorphicMatcher(internal::StrEqualityMatcher<internal::wstring>( + str, false, false)); +} + +// Creates a matcher that matches any wstring, std::wstring, or C wide string +// that contains the given substring. +inline PolymorphicMatcher<internal::HasSubstrMatcher<internal::wstring> > + HasSubstr(const internal::wstring& substring) { + return MakePolymorphicMatcher(internal::HasSubstrMatcher<internal::wstring>( + substring)); +} + +// Matches a string that starts with 'prefix' (case-sensitive). +inline PolymorphicMatcher<internal::StartsWithMatcher<internal::wstring> > + StartsWith(const internal::wstring& prefix) { + return MakePolymorphicMatcher(internal::StartsWithMatcher<internal::wstring>( + prefix)); +} + +// Matches a string that ends with 'suffix' (case-sensitive). 
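+//
+// For example (a sketch, assuming wide-string support is enabled):
+//
+//   EXPECT_THAT(::std::wstring(L"foo.cc"), EndsWith(L".cc"));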
+inline PolymorphicMatcher<internal::EndsWithMatcher<internal::wstring> >
+    EndsWith(const internal::wstring& suffix) {
+  return MakePolymorphicMatcher(internal::EndsWithMatcher<internal::wstring>(
+      suffix));
+}
+
+#endif  // GTEST_HAS_GLOBAL_WSTRING || GTEST_HAS_STD_WSTRING
+
+// Creates a polymorphic matcher that matches a 2-tuple where the
+// first field == the second field.
+inline internal::Eq2Matcher Eq() { return internal::Eq2Matcher(); }
+
+// Creates a polymorphic matcher that matches a 2-tuple where the
+// first field >= the second field.
+inline internal::Ge2Matcher Ge() { return internal::Ge2Matcher(); }
+
+// Creates a polymorphic matcher that matches a 2-tuple where the
+// first field > the second field.
+inline internal::Gt2Matcher Gt() { return internal::Gt2Matcher(); }
+
+// Creates a polymorphic matcher that matches a 2-tuple where the
+// first field <= the second field.
+inline internal::Le2Matcher Le() { return internal::Le2Matcher(); }
+
+// Creates a polymorphic matcher that matches a 2-tuple where the
+// first field < the second field.
+inline internal::Lt2Matcher Lt() { return internal::Lt2Matcher(); }
+
+// Creates a polymorphic matcher that matches a 2-tuple where the
+// first field != the second field.
+inline internal::Ne2Matcher Ne() { return internal::Ne2Matcher(); }
+
+// Creates a matcher that matches any value of type T that m doesn't
+// match.
+template <typename InnerMatcher>
+inline internal::NotMatcher<InnerMatcher> Not(InnerMatcher m) {
+  return internal::NotMatcher<InnerMatcher>(m);
+}
+
+// Returns a matcher that matches anything that satisfies the given
+// predicate. The predicate can be any unary function or functor
+// whose return type can be implicitly converted to bool.
+template <typename Predicate>
+inline PolymorphicMatcher<internal::TrulyMatcher<Predicate> >
+Truly(Predicate pred) {
+  return MakePolymorphicMatcher(internal::TrulyMatcher<Predicate>(pred));
+}
+
+// Returns a matcher that matches the container size. The container must
+// support both size() and size_type, which all STL-like containers provide.
+// Note that the parameter 'size' can be a value of type size_type as well as
+// a matcher. For instance:
+//   EXPECT_THAT(container, SizeIs(2));      // Checks container has 2 elements.
+//   EXPECT_THAT(container, SizeIs(Le(2)));  // Checks container has at most 2.
+template <typename SizeMatcher>
+inline internal::SizeIsMatcher<SizeMatcher>
+SizeIs(const SizeMatcher& size_matcher) {
+  return internal::SizeIsMatcher<SizeMatcher>(size_matcher);
+}
+
+// Returns a matcher that matches the distance between the container's begin()
+// iterator and its end() iterator, i.e. the size of the container. This matcher
+// can be used instead of SizeIs with containers such as std::forward_list which
+// do not implement size(). The container must provide const_iterator (with
+// valid iterator_traits), begin() and end().
+template <typename DistanceMatcher>
+inline internal::BeginEndDistanceIsMatcher<DistanceMatcher>
+BeginEndDistanceIs(const DistanceMatcher& distance_matcher) {
+  return internal::BeginEndDistanceIsMatcher<DistanceMatcher>(distance_matcher);
+}
+
+// Returns a matcher that matches an equal container.
+// This matcher behaves like Eq(), but in the event of mismatch lists the
+// values that are included in one container but not the other. (Duplicate
+// values and order differences are not explained.)
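+//
+// A usage sketch (the containers are illustrative):
+//
+//   ::std::vector<int> actual;
+//   actual.push_back(1);
+//   ::std::vector<int> expected;
+//   expected.push_back(1);
+//   EXPECT_THAT(actual, ContainerEq(expected));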
+template <typename Container>
+inline PolymorphicMatcher<internal::ContainerEqMatcher<  // NOLINT
+                            GTEST_REMOVE_CONST_(Container)> >
+    ContainerEq(const Container& rhs) {
+  // The following line is for working around a bug in MSVC 8.0,
+  // which causes Container to be a const type sometimes.
+  typedef GTEST_REMOVE_CONST_(Container) RawContainer;
+  return MakePolymorphicMatcher(
+      internal::ContainerEqMatcher<RawContainer>(rhs));
+}
+
+// Returns a matcher that matches a container that, when sorted using
+// the given comparator, matches container_matcher.
+template <typename Comparator, typename ContainerMatcher>
+inline internal::WhenSortedByMatcher<Comparator, ContainerMatcher>
+WhenSortedBy(const Comparator& comparator,
+             const ContainerMatcher& container_matcher) {
+  return internal::WhenSortedByMatcher<Comparator, ContainerMatcher>(
+      comparator, container_matcher);
+}
+
+// Returns a matcher that matches a container that, when sorted using
+// the < operator, matches container_matcher.
+template <typename ContainerMatcher>
+inline internal::WhenSortedByMatcher<internal::LessComparator, ContainerMatcher>
+WhenSorted(const ContainerMatcher& container_matcher) {
+  return
+      internal::WhenSortedByMatcher<internal::LessComparator, ContainerMatcher>(
+          internal::LessComparator(), container_matcher);
+}
+
+// Matches an STL-style container or a native array that contains the
+// same number of elements as in rhs, where its i-th element and rhs's
+// i-th element (as a pair) satisfy the given pair matcher, for all i.
+// TupleMatcher must be able to be safely cast to Matcher<tuple<const
+// T1&, const T2&> >, where T1 and T2 are the types of elements in the
+// LHS container and the RHS container respectively.
+template <typename TupleMatcher, typename Container>
+inline internal::PointwiseMatcher<TupleMatcher,
+                                  GTEST_REMOVE_CONST_(Container)>
+Pointwise(const TupleMatcher& tuple_matcher, const Container& rhs) {
+  // The following line is for working around a bug in MSVC 8.0,
+  // which causes Container to be a const type sometimes (e.g. when
+  // rhs is a const int[]).
+  typedef GTEST_REMOVE_CONST_(Container) RawContainer;
+  return internal::PointwiseMatcher<TupleMatcher, RawContainer>(
+      tuple_matcher, rhs);
+}
+
+#if GTEST_HAS_STD_INITIALIZER_LIST_
+
+// Supports the Pointwise(m, {a, b, c}) syntax.
+template <typename TupleMatcher, typename T>
+inline internal::PointwiseMatcher<TupleMatcher, std::vector<T> > Pointwise(
+    const TupleMatcher& tuple_matcher, std::initializer_list<T> rhs) {
+  return Pointwise(tuple_matcher, std::vector<T>(rhs));
+}
+
+#endif  // GTEST_HAS_STD_INITIALIZER_LIST_
+
+// UnorderedPointwise(pair_matcher, rhs) matches an STL-style
+// container or a native array that contains the same number of
+// elements as in rhs, where in some permutation of the container, its
+// i-th element and rhs's i-th element (as a pair) satisfy the given
+// pair matcher, for all i. Tuple2Matcher must be able to be safely
+// cast to Matcher<tuple<const T1&, const T2&> >, where T1 and T2 are
+// the types of elements in the LHS container and the RHS container
+// respectively.
+//
+// This is like Pointwise(pair_matcher, rhs), except that the element
+// order doesn't matter.
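+//
+// A sketch of typical use (the values are illustrative):
+//
+//   ::std::vector<int> actual;
+//   actual.push_back(2);
+//   actual.push_back(1);
+//   const int expected[] = { 1, 2 };
+//   EXPECT_THAT(actual, UnorderedPointwise(Eq(), expected));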
+template <typename Tuple2Matcher, typename RhsContainer>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename internal::BoundSecondMatcher<
+        Tuple2Matcher, typename internal::StlContainerView<GTEST_REMOVE_CONST_(
+                           RhsContainer)>::type::value_type> >
+UnorderedPointwise(const Tuple2Matcher& tuple2_matcher,
+                   const RhsContainer& rhs_container) {
+  // The following line is for working around a bug in MSVC 8.0,
+  // which causes RhsContainer to be a const type sometimes (e.g. when
+  // rhs_container is a const int[]).
+  typedef GTEST_REMOVE_CONST_(RhsContainer) RawRhsContainer;
+
+  // RhsView allows the same code to handle RhsContainer being an
+  // STL-style container and it being a native C-style array.
+  typedef typename internal::StlContainerView<RawRhsContainer> RhsView;
+  typedef typename RhsView::type RhsStlContainer;
+  typedef typename RhsStlContainer::value_type Second;
+  const RhsStlContainer& rhs_stl_container =
+      RhsView::ConstReference(rhs_container);
+
+  // Create a matcher for each element in rhs_container.
+  ::std::vector<internal::BoundSecondMatcher<Tuple2Matcher, Second> > matchers;
+  for (typename RhsStlContainer::const_iterator it = rhs_stl_container.begin();
+       it != rhs_stl_container.end(); ++it) {
+    matchers.push_back(
+        internal::MatcherBindSecond(tuple2_matcher, *it));
+  }
+
+  // Delegate the work to UnorderedElementsAreArray().
+  return UnorderedElementsAreArray(matchers);
+}
+
+#if GTEST_HAS_STD_INITIALIZER_LIST_
+
+// Supports the UnorderedPointwise(m, {a, b, c}) syntax.
+template <typename Tuple2Matcher, typename T>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename internal::BoundSecondMatcher<Tuple2Matcher, T> >
+UnorderedPointwise(const Tuple2Matcher& tuple2_matcher,
+                   std::initializer_list<T> rhs) {
+  return UnorderedPointwise(tuple2_matcher, std::vector<T>(rhs));
+}
+
+#endif  // GTEST_HAS_STD_INITIALIZER_LIST_
+
+// Matches an STL-style container or a native array that contains at
+// least one element matching the given value or matcher.
+//
+// Examples:
+//   ::std::set<int> page_ids;
+//   page_ids.insert(3);
+//   page_ids.insert(1);
+//   EXPECT_THAT(page_ids, Contains(1));
+//   EXPECT_THAT(page_ids, Contains(Gt(2)));
+//   EXPECT_THAT(page_ids, Not(Contains(4)));
+//
+//   ::std::map<int, size_t> page_lengths;
+//   page_lengths[1] = 100;
+//   EXPECT_THAT(page_lengths,
+//               Contains(::std::pair<const int, size_t>(1, 100)));
+//
+//   const char* user_ids[] = { "joe", "mike", "tom" };
+//   EXPECT_THAT(user_ids, Contains(Eq(::std::string("tom"))));
+template <typename M>
+inline internal::ContainsMatcher<M> Contains(M matcher) {
+  return internal::ContainsMatcher<M>(matcher);
+}
+
+// Matches an STL-style container or a native array that contains only
+// elements matching the given value or matcher.
+//
+// Each(m) is semantically equivalent to Not(Contains(Not(m))). Only
+// the messages are different.
+//
+// Examples:
+//   ::std::set<int> page_ids;
+//   // Each(m) matches an empty container, regardless of what m is.
+// EXPECT_THAT(page_ids, Each(Eq(1))); +// EXPECT_THAT(page_ids, Each(Eq(77))); +// +// page_ids.insert(3); +// EXPECT_THAT(page_ids, Each(Gt(0))); +// EXPECT_THAT(page_ids, Not(Each(Gt(4)))); +// page_ids.insert(1); +// EXPECT_THAT(page_ids, Not(Each(Lt(2)))); +// +// ::std::map<int, size_t> page_lengths; +// page_lengths[1] = 100; +// page_lengths[2] = 200; +// page_lengths[3] = 300; +// EXPECT_THAT(page_lengths, Not(Each(Pair(1, 100)))); +// EXPECT_THAT(page_lengths, Each(Key(Le(3)))); +// +// const char* user_ids[] = { "joe", "mike", "tom" }; +// EXPECT_THAT(user_ids, Not(Each(Eq(::std::string("tom"))))); +template <typename M> +inline internal::EachMatcher<M> Each(M matcher) { + return internal::EachMatcher<M>(matcher); +} + +// Key(inner_matcher) matches an std::pair whose 'first' field matches +// inner_matcher. For example, Contains(Key(Ge(5))) can be used to match an +// std::map that contains at least one element whose key is >= 5. +template <typename M> +inline internal::KeyMatcher<M> Key(M inner_matcher) { + return internal::KeyMatcher<M>(inner_matcher); +} + +// Pair(first_matcher, second_matcher) matches a std::pair whose 'first' field +// matches first_matcher and whose 'second' field matches second_matcher. For +// example, EXPECT_THAT(map_type, ElementsAre(Pair(Ge(5), "foo"))) can be used +// to match a std::map<int, string> that contains exactly one element whose key +// is >= 5 and whose value equals "foo". +template <typename FirstMatcher, typename SecondMatcher> +inline internal::PairMatcher<FirstMatcher, SecondMatcher> +Pair(FirstMatcher first_matcher, SecondMatcher second_matcher) { + return internal::PairMatcher<FirstMatcher, SecondMatcher>( + first_matcher, second_matcher); +} + +// Returns a predicate that is satisfied by anything that matches the +// given matcher. +template <typename M> +inline internal::MatcherAsPredicate<M> Matches(M matcher) { + return internal::MatcherAsPredicate<M>(matcher); +} + +// Returns true iff the value matches the matcher. +template <typename T, typename M> +inline bool Value(const T& value, M matcher) { + return testing::Matches(matcher)(value); +} + +// Matches the value against the given matcher and explains the match +// result to listener. +template <typename T, typename M> +inline bool ExplainMatchResult( + M matcher, const T& value, MatchResultListener* listener) { + return SafeMatcherCast<const T&>(matcher).MatchAndExplain(value, listener); +} + +#if GTEST_LANG_CXX11 +// Define variadic matcher versions. They are overloaded in +// gmock-generated-matchers.h for the cases supported by pre C++11 compilers. +template <typename... Args> +inline internal::AllOfMatcher<Args...> AllOf(const Args&... matchers) { + return internal::AllOfMatcher<Args...>(matchers...); +} + +template <typename... Args> +inline internal::AnyOfMatcher<Args...> AnyOf(const Args&... matchers) { + return internal::AnyOfMatcher<Args...>(matchers...); +} + +#endif // GTEST_LANG_CXX11 + +// AllArgs(m) is a synonym of m. This is useful in +// +// EXPECT_CALL(foo, Bar(_, _)).With(AllArgs(Eq())); +// +// which is easier to read than +// +// EXPECT_CALL(foo, Bar(_, _)).With(Eq()); +template <typename InnerMatcher> +inline InnerMatcher AllArgs(const InnerMatcher& matcher) { return matcher; } + +// These macros allow using matchers to check values in Google Test +// tests. ASSERT_THAT(value, matcher) and EXPECT_THAT(value, matcher) +// succeed iff the value matches the matcher. 
If the assertion fails, +// the value and the description of the matcher will be printed. +#define ASSERT_THAT(value, matcher) ASSERT_PRED_FORMAT1(\ + ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value) +#define EXPECT_THAT(value, matcher) EXPECT_PRED_FORMAT1(\ + ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value) + +} // namespace testing + +// Include any custom callback matchers added by the local installation. +// We must include this header at the end to make sure it can use the +// declarations from this file. +#include "gmock/internal/custom/gmock-matchers.h" +#endif // GMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_ diff --git a/utils/unittest/googlemock/include/gmock/gmock-more-actions.h b/utils/unittest/googlemock/include/gmock/gmock-more-actions.h new file mode 100644 index 000000000000..3d387b6b7d75 --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/gmock-more-actions.h @@ -0,0 +1,246 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file implements some actions that depend on gmock-generated-actions.h. + +#ifndef GMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_ +#define GMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_ + +#include <algorithm> + +#include "gmock/gmock-generated-actions.h" + +namespace testing { +namespace internal { + +// Implements the Invoke(f) action. The template argument +// FunctionImpl is the implementation type of f, which can be either a +// function pointer or a functor. Invoke(f) can be used as an +// Action<F> as long as f's type is compatible with F (i.e. f can be +// assigned to a tr1::function<F>). +template <typename FunctionImpl> +class InvokeAction { + public: + // The c'tor makes a copy of function_impl (either a function + // pointer or a functor). 
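+  //
+  // For illustration, Invoke(f) (defined later in this file) wraps this
+  // action; a usage sketch with a hypothetical free function and mock:
+  //
+  //   int Add(int a, int b) { return a + b; }
+  //   ...
+  //   EXPECT_CALL(mock, Sum(_, _)).WillOnce(Invoke(Add));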
+ explicit InvokeAction(FunctionImpl function_impl) + : function_impl_(function_impl) {} + + template <typename Result, typename ArgumentTuple> + Result Perform(const ArgumentTuple& args) { + return InvokeHelper<Result, ArgumentTuple>::Invoke(function_impl_, args); + } + + private: + FunctionImpl function_impl_; + + GTEST_DISALLOW_ASSIGN_(InvokeAction); +}; + +// Implements the Invoke(object_ptr, &Class::Method) action. +template <class Class, typename MethodPtr> +class InvokeMethodAction { + public: + InvokeMethodAction(Class* obj_ptr, MethodPtr method_ptr) + : method_ptr_(method_ptr), obj_ptr_(obj_ptr) {} + + template <typename Result, typename ArgumentTuple> + Result Perform(const ArgumentTuple& args) const { + return InvokeHelper<Result, ArgumentTuple>::InvokeMethod( + obj_ptr_, method_ptr_, args); + } + + private: + // The order of these members matters. Reversing the order can trigger + // warning C4121 in MSVC (see + // http://computer-programming-forum.com/7-vc.net/6fbc30265f860ad1.htm ). + const MethodPtr method_ptr_; + Class* const obj_ptr_; + + GTEST_DISALLOW_ASSIGN_(InvokeMethodAction); +}; + +// An internal replacement for std::copy which mimics its behavior. This is +// necessary because Visual Studio deprecates ::std::copy, issuing warning 4996. +// However Visual Studio 2010 and later do not honor #pragmas which disable that +// warning. +template<typename InputIterator, typename OutputIterator> +inline OutputIterator CopyElements(InputIterator first, + InputIterator last, + OutputIterator output) { + for (; first != last; ++first, ++output) { + *output = *first; + } + return output; +} + +} // namespace internal + +// Various overloads for Invoke(). + +// Creates an action that invokes 'function_impl' with the mock +// function's arguments. +template <typename FunctionImpl> +PolymorphicAction<internal::InvokeAction<FunctionImpl> > Invoke( + FunctionImpl function_impl) { + return MakePolymorphicAction( + internal::InvokeAction<FunctionImpl>(function_impl)); +} + +// Creates an action that invokes the given method on the given object +// with the mock function's arguments. +template <class Class, typename MethodPtr> +PolymorphicAction<internal::InvokeMethodAction<Class, MethodPtr> > Invoke( + Class* obj_ptr, MethodPtr method_ptr) { + return MakePolymorphicAction( + internal::InvokeMethodAction<Class, MethodPtr>(obj_ptr, method_ptr)); +} + +// WithoutArgs(inner_action) can be used in a mock function with a +// non-empty argument list to perform inner_action, which takes no +// argument. In other words, it adapts an action accepting no +// argument to one that accepts (and ignores) arguments. +template <typename InnerAction> +inline internal::WithArgsAction<InnerAction> +WithoutArgs(const InnerAction& action) { + return internal::WithArgsAction<InnerAction>(action); +} + +// WithArg<k>(an_action) creates an action that passes the k-th +// (0-based) argument of the mock function to an_action and performs +// it. It adapts an action accepting one argument to one that accepts +// multiple arguments. For convenience, we also provide +// WithArgs<k>(an_action) (defined below) as a synonym. +template <int k, typename InnerAction> +inline internal::WithArgsAction<InnerAction, k> +WithArg(const InnerAction& action) { + return internal::WithArgsAction<InnerAction, k>(action); +} + +// The ACTION*() macros trigger warning C4100 (unreferenced formal +// parameter) in MSVC with -W4. 
Unfortunately they cannot be fixed in +// the macro definition, as the warnings are generated when the macro +// is expanded and macro expansion cannot contain #pragma. Therefore +// we suppress them here. +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable:4100) +#endif + +// Action ReturnArg<k>() returns the k-th argument of the mock function. +ACTION_TEMPLATE(ReturnArg, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_0_VALUE_PARAMS()) { + return ::testing::get<k>(args); +} + +// Action SaveArg<k>(pointer) saves the k-th (0-based) argument of the +// mock function to *pointer. +ACTION_TEMPLATE(SaveArg, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_1_VALUE_PARAMS(pointer)) { + *pointer = ::testing::get<k>(args); +} + +// Action SaveArgPointee<k>(pointer) saves the value pointed to +// by the k-th (0-based) argument of the mock function to *pointer. +ACTION_TEMPLATE(SaveArgPointee, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_1_VALUE_PARAMS(pointer)) { + *pointer = *::testing::get<k>(args); +} + +// Action SetArgReferee<k>(value) assigns 'value' to the variable +// referenced by the k-th (0-based) argument of the mock function. +ACTION_TEMPLATE(SetArgReferee, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_1_VALUE_PARAMS(value)) { + typedef typename ::testing::tuple_element<k, args_type>::type argk_type; + // Ensures that argument #k is a reference. If you get a compiler + // error on the next line, you are using SetArgReferee<k>(value) in + // a mock function whose k-th (0-based) argument is not a reference. + GTEST_COMPILE_ASSERT_(internal::is_reference<argk_type>::value, + SetArgReferee_must_be_used_with_a_reference_argument); + ::testing::get<k>(args) = value; +} + +// Action SetArrayArgument<k>(first, last) copies the elements in +// source range [first, last) to the array pointed to by the k-th +// (0-based) argument, which can be either a pointer or an +// iterator. The action does not take ownership of the elements in the +// source range. +ACTION_TEMPLATE(SetArrayArgument, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_2_VALUE_PARAMS(first, last)) { + // Visual Studio deprecates ::std::copy, so we use our own copy in that case. +#ifdef _MSC_VER + internal::CopyElements(first, last, ::testing::get<k>(args)); +#else + ::std::copy(first, last, ::testing::get<k>(args)); +#endif +} + +// Action DeleteArg<k>() deletes the k-th (0-based) argument of the mock +// function. +ACTION_TEMPLATE(DeleteArg, + HAS_1_TEMPLATE_PARAMS(int, k), + AND_0_VALUE_PARAMS()) { + delete ::testing::get<k>(args); +} + +// This action returns the value pointed to by 'pointer'. +ACTION_P(ReturnPointee, pointer) { return *pointer; } + +// Action Throw(exception) can be used in a mock function of any type +// to throw the given exception. Any copyable value can be thrown. +#if GTEST_HAS_EXCEPTIONS + +// Suppresses the 'unreachable code' warning that VC generates in opt modes. +# ifdef _MSC_VER +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4702) // Temporarily disables warning 4702. +# endif +ACTION_P(Throw, exception) { throw exception; } +# ifdef _MSC_VER +# pragma warning(pop) // Restores the warning state. 
+# endif + +#endif // GTEST_HAS_EXCEPTIONS + +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +} // namespace testing + +#endif // GMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_ diff --git a/utils/unittest/googlemock/include/gmock/gmock-more-matchers.h b/utils/unittest/googlemock/include/gmock/gmock-more-matchers.h new file mode 100644 index 000000000000..3db899f4297b --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/gmock-more-matchers.h @@ -0,0 +1,58 @@ +// Copyright 2013, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: marcus.boerger@google.com (Marcus Boerger) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file implements some matchers that depend on gmock-generated-matchers.h. +// +// Note that tests are implemented in gmock-matchers_test.cc rather than +// gmock-more-matchers-test.cc. + +#ifndef GMOCK_GMOCK_MORE_MATCHERS_H_ +#define GMOCK_GMOCK_MORE_MATCHERS_H_ + +#include "gmock/gmock-generated-matchers.h" + +namespace testing { + +// Defines a matcher that matches an empty container. The container must +// support both size() and empty(), which all STL-like containers provide. +MATCHER(IsEmpty, negation ? "isn't empty" : "is empty") { + if (arg.empty()) { + return true; + } + *result_listener << "whose size is " << arg.size(); + return false; +} + +} // namespace testing + +#endif // GMOCK_GMOCK_MORE_MATCHERS_H_ diff --git a/utils/unittest/googlemock/include/gmock/gmock-spec-builders.h b/utils/unittest/googlemock/include/gmock/gmock-spec-builders.h new file mode 100644 index 000000000000..fed7de66bc4c --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/gmock-spec-builders.h @@ -0,0 +1,1847 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file implements the ON_CALL() and EXPECT_CALL() macros. +// +// A user can use the ON_CALL() macro to specify the default action of +// a mock method. The syntax is: +// +// ON_CALL(mock_object, Method(argument-matchers)) +// .With(multi-argument-matcher) +// .WillByDefault(action); +// +// where the .With() clause is optional. +// +// A user can use the EXPECT_CALL() macro to specify an expectation on +// a mock method. The syntax is: +// +// EXPECT_CALL(mock_object, Method(argument-matchers)) +// .With(multi-argument-matchers) +// .Times(cardinality) +// .InSequence(sequences) +// .After(expectations) +// .WillOnce(action) +// .WillRepeatedly(action) +// .RetiresOnSaturation(); +// +// where all clauses are optional, and .InSequence()/.After()/ +// .WillOnce() can appear any number of times. + +#ifndef GMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_ +#define GMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_ + +#include <map> +#include <set> +#include <sstream> +#include <string> +#include <vector> + +#if GTEST_HAS_EXCEPTIONS +# include <stdexcept> // NOLINT +#endif + +#include "gmock/gmock-actions.h" +#include "gmock/gmock-cardinalities.h" +#include "gmock/gmock-matchers.h" +#include "gmock/internal/gmock-internal-utils.h" +#include "gmock/internal/gmock-port.h" +#include "gtest/gtest.h" + +namespace testing { + +// An abstract handle of an expectation. +class Expectation; + +// A set of expectation handles. +class ExpectationSet; + +// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION +// and MUST NOT BE USED IN USER CODE!!! +namespace internal { + +// Implements a mock function. +template <typename F> class FunctionMocker; + +// Base class for expectations. +class ExpectationBase; + +// Implements an expectation. +template <typename F> class TypedExpectation; + +// Helper class for testing the Expectation class template. +class ExpectationTester; + +// Base class for function mockers. +template <typename F> class FunctionMockerBase; + +// Protects the mock object registry (in class Mock), all function +// mockers, and all expectations. 
+//
+// The reason we don't use more fine-grained protection is: when a
+// mock function Foo() is called, it needs to consult its expectations
+// to see which one should be picked. If another thread is allowed to
+// call a mock function (either Foo() or a different one) at the same
+// time, it could affect the "retired" attributes of Foo()'s
+// expectations when InSequence() is used, and thus affect which
+// expectation gets picked. Therefore, we sequence all mock function
+// calls to ensure the integrity of the mock objects' states.
+GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_gmock_mutex);
+
+// Untyped base class for ActionResultHolder<R>.
+class UntypedActionResultHolderBase;
+
+// Abstract base class of FunctionMockerBase. This is the
+// type-agnostic part of the function mocker interface. Its pure
+// virtual methods are implemented by FunctionMockerBase.
+class GTEST_API_ UntypedFunctionMockerBase {
+ public:
+  UntypedFunctionMockerBase();
+  virtual ~UntypedFunctionMockerBase();
+
+  // Verifies that all expectations on this mock function have been
+  // satisfied. Reports one or more Google Test non-fatal failures
+  // and returns false if not.
+  bool VerifyAndClearExpectationsLocked()
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
+
+  // Clears the ON_CALL()s set on this mock function.
+  virtual void ClearDefaultActionsLocked()
+      GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) = 0;
+
+  // In all of the following Untyped* functions, it's the caller's
+  // responsibility to guarantee the correctness of the arguments'
+  // types.
+
+  // Performs the default action with the given arguments and returns
+  // the action's result. The call description string will be used in
+  // the error message to describe the call in the case the default
+  // action fails.
+  // L = *
+  virtual UntypedActionResultHolderBase* UntypedPerformDefaultAction(
+      const void* untyped_args,
+      const string& call_description) const = 0;
+
+  // Performs the given action with the given arguments and returns
+  // the action's result.
+  // L = *
+  virtual UntypedActionResultHolderBase* UntypedPerformAction(
+      const void* untyped_action,
+      const void* untyped_args) const = 0;
+
+  // Writes a message that the call is uninteresting (i.e. neither
+  // explicitly expected nor explicitly unexpected) to the given
+  // ostream.
+  virtual void UntypedDescribeUninterestingCall(
+      const void* untyped_args,
+      ::std::ostream* os) const
+          GTEST_LOCK_EXCLUDED_(g_gmock_mutex) = 0;
+
+  // Returns the expectation that matches the given function arguments
+  // (or NULL if there's no match); when a match is found,
+  // untyped_action is set to point to the action that should be
+  // performed (or NULL if the action is "do default"), and
+  // is_excessive is modified to indicate whether the call exceeds the
+  // expected number.
+  virtual const ExpectationBase* UntypedFindMatchingExpectation(
+      const void* untyped_args,
+      const void** untyped_action, bool* is_excessive,
+      ::std::ostream* what, ::std::ostream* why)
+          GTEST_LOCK_EXCLUDED_(g_gmock_mutex) = 0;
+
+  // Prints the given function arguments to the ostream.
+  virtual void UntypedPrintArgs(const void* untyped_args,
+                                ::std::ostream* os) const = 0;
+
+  // Sets the mock object this mock method belongs to, and registers
+  // this information in the global mock registry. Will be called
+  // whenever an EXPECT_CALL() or ON_CALL() is executed on this mock
+  // method.
+  // TODO(wan@google.com): rename to SetAndRegisterOwner().
+ void RegisterOwner(const void* mock_obj) + GTEST_LOCK_EXCLUDED_(g_gmock_mutex); + + // Sets the mock object this mock method belongs to, and sets the + // name of the mock function. Will be called upon each invocation + // of this mock function. + void SetOwnerAndName(const void* mock_obj, const char* name) + GTEST_LOCK_EXCLUDED_(g_gmock_mutex); + + // Returns the mock object this mock method belongs to. Must be + // called after RegisterOwner() or SetOwnerAndName() has been + // called. + const void* MockObject() const + GTEST_LOCK_EXCLUDED_(g_gmock_mutex); + + // Returns the name of this mock method. Must be called after + // SetOwnerAndName() has been called. + const char* Name() const + GTEST_LOCK_EXCLUDED_(g_gmock_mutex); + + // Returns the result of invoking this mock function with the given + // arguments. This function can be safely called from multiple + // threads concurrently. The caller is responsible for deleting the + // result. + UntypedActionResultHolderBase* UntypedInvokeWith( + const void* untyped_args) + GTEST_LOCK_EXCLUDED_(g_gmock_mutex); + + protected: + typedef std::vector<const void*> UntypedOnCallSpecs; + + typedef std::vector<internal::linked_ptr<ExpectationBase> > + UntypedExpectations; + + // Returns an Expectation object that references and co-owns exp, + // which must be an expectation on this mock function. + Expectation GetHandleOf(ExpectationBase* exp); + + // Address of the mock object this mock method belongs to. Only + // valid after this mock method has been called or + // ON_CALL/EXPECT_CALL has been invoked on it. + const void* mock_obj_; // Protected by g_gmock_mutex. + + // Name of the function being mocked. Only valid after this mock + // method has been called. + const char* name_; // Protected by g_gmock_mutex. + + // All default action specs for this function mocker. + UntypedOnCallSpecs untyped_on_call_specs_; + + // All expectations for this function mocker. + UntypedExpectations untyped_expectations_; +}; // class UntypedFunctionMockerBase + +// Untyped base class for OnCallSpec<F>. +class UntypedOnCallSpecBase { + public: + // The arguments are the location of the ON_CALL() statement. + UntypedOnCallSpecBase(const char* a_file, int a_line) + : file_(a_file), line_(a_line), last_clause_(kNone) {} + + // Where in the source file was the default action spec defined? + const char* file() const { return file_; } + int line() const { return line_; } + + protected: + // Gives each clause in the ON_CALL() statement a name. + enum Clause { + // Do not change the order of the enum members! The run-time + // syntax checking relies on it. + kNone, + kWith, + kWillByDefault + }; + + // Asserts that the ON_CALL() statement has a certain property. + void AssertSpecProperty(bool property, const string& failure_message) const { + Assert(property, file_, line_, failure_message); + } + + // Expects that the ON_CALL() statement has a certain property. + void ExpectSpecProperty(bool property, const string& failure_message) const { + Expect(property, file_, line_, failure_message); + } + + const char* file_; + int line_; + + // The last clause in the ON_CALL() statement as seen so far. + // Initially kNone and changes as the statement is parsed. + Clause last_clause_; +}; // class UntypedOnCallSpecBase + +// This template class implements an ON_CALL spec. 
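+// For illustration, a statement like the following (the mock object
+// and method are hypothetical) is recorded in an OnCallSpec:
+//
+//   ON_CALL(mock_foo, Bar(Gt(0)))
+//       .WillByDefault(Return(1));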
+template <typename F> +class OnCallSpec : public UntypedOnCallSpecBase { + public: + typedef typename Function<F>::ArgumentTuple ArgumentTuple; + typedef typename Function<F>::ArgumentMatcherTuple ArgumentMatcherTuple; + + // Constructs an OnCallSpec object from the information inside + // the parenthesis of an ON_CALL() statement. + OnCallSpec(const char* a_file, int a_line, + const ArgumentMatcherTuple& matchers) + : UntypedOnCallSpecBase(a_file, a_line), + matchers_(matchers), + // By default, extra_matcher_ should match anything. However, + // we cannot initialize it with _ as that triggers a compiler + // bug in Symbian's C++ compiler (cannot decide between two + // overloaded constructors of Matcher<const ArgumentTuple&>). + extra_matcher_(A<const ArgumentTuple&>()) { + } + + // Implements the .With() clause. + OnCallSpec& With(const Matcher<const ArgumentTuple&>& m) { + // Makes sure this is called at most once. + ExpectSpecProperty(last_clause_ < kWith, + ".With() cannot appear " + "more than once in an ON_CALL()."); + last_clause_ = kWith; + + extra_matcher_ = m; + return *this; + } + + // Implements the .WillByDefault() clause. + OnCallSpec& WillByDefault(const Action<F>& action) { + ExpectSpecProperty(last_clause_ < kWillByDefault, + ".WillByDefault() must appear " + "exactly once in an ON_CALL()."); + last_clause_ = kWillByDefault; + + ExpectSpecProperty(!action.IsDoDefault(), + "DoDefault() cannot be used in ON_CALL()."); + action_ = action; + return *this; + } + + // Returns true iff the given arguments match the matchers. + bool Matches(const ArgumentTuple& args) const { + return TupleMatches(matchers_, args) && extra_matcher_.Matches(args); + } + + // Returns the action specified by the user. + const Action<F>& GetAction() const { + AssertSpecProperty(last_clause_ == kWillByDefault, + ".WillByDefault() must appear exactly " + "once in an ON_CALL()."); + return action_; + } + + private: + // The information in statement + // + // ON_CALL(mock_object, Method(matchers)) + // .With(multi-argument-matcher) + // .WillByDefault(action); + // + // is recorded in the data members like this: + // + // source file that contains the statement => file_ + // line number of the statement => line_ + // matchers => matchers_ + // multi-argument-matcher => extra_matcher_ + // action => action_ + ArgumentMatcherTuple matchers_; + Matcher<const ArgumentTuple&> extra_matcher_; + Action<F> action_; +}; // class OnCallSpec + +// Possible reactions on uninteresting calls. +enum CallReaction { + kAllow, + kWarn, + kFail, + kDefault = kWarn // By default, warn about uninteresting calls. +}; + +} // namespace internal + +// Utilities for manipulating mock objects. +class GTEST_API_ Mock { + public: + // The following public methods can be called concurrently. + + // Tells Google Mock to ignore mock_obj when checking for leaked + // mock objects. + static void AllowLeak(const void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); + + // Verifies and clears all expectations on the given mock object. + // If the expectations aren't satisfied, generates one or more + // Google Test non-fatal failures and returns false. + static bool VerifyAndClearExpectations(void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); + + // Verifies all expectations on the given mock object and clears its + // default actions and expectations. Returns true iff the + // verification was successful. 
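+  //
+  // A usage sketch (MockFoo and Bar() are hypothetical):
+  //
+  //   MockFoo mock_foo;
+  //   EXPECT_CALL(mock_foo, Bar(_));
+  //   mock_foo.Bar(1);
+  //   Mock::VerifyAndClear(&mock_foo);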
+ static bool VerifyAndClear(void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); + + private: + friend class internal::UntypedFunctionMockerBase; + + // Needed for a function mocker to register itself (so that we know + // how to clear a mock object). + template <typename F> + friend class internal::FunctionMockerBase; + + template <typename M> + friend class NiceMock; + + template <typename M> + friend class NaggyMock; + + template <typename M> + friend class StrictMock; + + // Tells Google Mock to allow uninteresting calls on the given mock + // object. + static void AllowUninterestingCalls(const void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); + + // Tells Google Mock to warn the user about uninteresting calls on + // the given mock object. + static void WarnUninterestingCalls(const void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); + + // Tells Google Mock to fail uninteresting calls on the given mock + // object. + static void FailUninterestingCalls(const void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); + + // Tells Google Mock the given mock object is being destroyed and + // its entry in the call-reaction table should be removed. + static void UnregisterCallReaction(const void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); + + // Returns the reaction Google Mock will have on uninteresting calls + // made on the given mock object. + static internal::CallReaction GetReactionOnUninterestingCalls( + const void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); + + // Verifies that all expectations on the given mock object have been + // satisfied. Reports one or more Google Test non-fatal failures + // and returns false if not. + static bool VerifyAndClearExpectationsLocked(void* mock_obj) + GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex); + + // Clears all ON_CALL()s set on the given mock object. + static void ClearDefaultActionsLocked(void* mock_obj) + GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex); + + // Registers a mock object and a mock method it owns. + static void Register( + const void* mock_obj, + internal::UntypedFunctionMockerBase* mocker) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); + + // Tells Google Mock where in the source code mock_obj is used in an + // ON_CALL or EXPECT_CALL. In case mock_obj is leaked, this + // information helps the user identify which object it is. + static void RegisterUseByOnCallOrExpectCall( + const void* mock_obj, const char* file, int line) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex); + + // Unregisters a mock method; removes the owning mock object from + // the registry when the last mock method associated with it has + // been unregistered. This is called only in the destructor of + // FunctionMockerBase. + static void UnregisterLocked(internal::UntypedFunctionMockerBase* mocker) + GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex); +}; // class Mock + +// An abstract handle of an expectation. Useful in the .After() +// clause of EXPECT_CALL() for setting the (partial) order of +// expectations. The syntax: +// +// Expectation e1 = EXPECT_CALL(...)...; +// EXPECT_CALL(...).After(e1)...; +// +// sets two expectations where the latter can only be matched after +// the former has been satisfied. +// +// Notes: +// - This class is copyable and has value semantics. 
+// - Constness is shallow: a const Expectation object itself cannot +// be modified, but the mutable methods of the ExpectationBase +// object it references can be called via expectation_base(). +// - The constructors and destructor are defined out-of-line because +// the Symbian WINSCW compiler wants to otherwise instantiate them +// when it sees this class definition, at which point it doesn't have +// ExpectationBase available yet, leading to incorrect destruction +// in the linked_ptr (or compilation errors if using a checking +// linked_ptr). +class GTEST_API_ Expectation { + public: + // Constructs a null object that doesn't reference any expectation. + Expectation(); + + ~Expectation(); + + // This single-argument ctor must not be explicit, in order to support the + // Expectation e = EXPECT_CALL(...); + // syntax. + // + // A TypedExpectation object stores its pre-requisites as + // Expectation objects, and needs to call the non-const Retire() + // method on the ExpectationBase objects they reference. Therefore + // Expectation must receive a *non-const* reference to the + // ExpectationBase object. + Expectation(internal::ExpectationBase& exp); // NOLINT + + // The compiler-generated copy ctor and operator= work exactly as + // intended, so we don't need to define our own. + + // Returns true iff rhs references the same expectation as this object does. + bool operator==(const Expectation& rhs) const { + return expectation_base_ == rhs.expectation_base_; + } + + bool operator!=(const Expectation& rhs) const { return !(*this == rhs); } + + private: + friend class ExpectationSet; + friend class Sequence; + friend class ::testing::internal::ExpectationBase; + friend class ::testing::internal::UntypedFunctionMockerBase; + + template <typename F> + friend class ::testing::internal::FunctionMockerBase; + + template <typename F> + friend class ::testing::internal::TypedExpectation; + + // This comparator is needed for putting Expectation objects into a set. + class Less { + public: + bool operator()(const Expectation& lhs, const Expectation& rhs) const { + return lhs.expectation_base_.get() < rhs.expectation_base_.get(); + } + }; + + typedef ::std::set<Expectation, Less> Set; + + Expectation( + const internal::linked_ptr<internal::ExpectationBase>& expectation_base); + + // Returns the expectation this object references. + const internal::linked_ptr<internal::ExpectationBase>& + expectation_base() const { + return expectation_base_; + } + + // A linked_ptr that co-owns the expectation this handle references. + internal::linked_ptr<internal::ExpectationBase> expectation_base_; +}; + +// A set of expectation handles. Useful in the .After() clause of +// EXPECT_CALL() for setting the (partial) order of expectations. The +// syntax: +// +// ExpectationSet es; +// es += EXPECT_CALL(...)...; +// es += EXPECT_CALL(...)...; +// EXPECT_CALL(...).After(es)...; +// +// sets three expectations where the last one can only be matched +// after the first two have both been satisfied. +// +// This class is copyable and has value semantics. +class ExpectationSet { + public: + // A bidirectional iterator that can read a const element in the set. + typedef Expectation::Set::const_iterator const_iterator; + + // An object stored in the set. This is an alias of Expectation. + typedef Expectation::Set::value_type value_type; + + // Constructs an empty set. 
+  ExpectationSet() {}
+
+  // This single-argument ctor must not be explicit, in order to support the
+  //   ExpectationSet es = EXPECT_CALL(...);
+  // syntax.
+  ExpectationSet(internal::ExpectationBase& exp) {  // NOLINT
+    *this += Expectation(exp);
+  }
+
+  // This single-argument ctor implements implicit conversion from
+  // Expectation and thus must not be explicit.  This allows either an
+  // Expectation or an ExpectationSet to be used in .After().
+  ExpectationSet(const Expectation& e) {  // NOLINT
+    *this += e;
+  }
+
+  // The compiler-generated copy ctor and operator= work exactly as
+  // intended, so we don't need to define our own.
+
+  // Returns true iff rhs contains the same set of Expectation objects
+  // as this does.
+  bool operator==(const ExpectationSet& rhs) const {
+    return expectations_ == rhs.expectations_;
+  }
+
+  bool operator!=(const ExpectationSet& rhs) const { return !(*this == rhs); }
+
+  // Implements the syntax
+  //   expectation_set += EXPECT_CALL(...);
+  ExpectationSet& operator+=(const Expectation& e) {
+    expectations_.insert(e);
+    return *this;
+  }
+
+  int size() const { return static_cast<int>(expectations_.size()); }
+
+  const_iterator begin() const { return expectations_.begin(); }
+  const_iterator end() const { return expectations_.end(); }
+
+ private:
+  Expectation::Set expectations_;
+};
+
+
+// Sequence objects are used by a user to specify the relative order
+// in which the expectations should match.  They are copyable (we rely
+// on the compiler-defined copy constructor and assignment operator).
+class GTEST_API_ Sequence {
+ public:
+  // Constructs an empty sequence.
+  Sequence() : last_expectation_(new Expectation) {}
+
+  // Adds an expectation to this sequence.  The caller must ensure
+  // that no other thread is accessing this Sequence object.
+  void AddExpectation(const Expectation& expectation) const;
+
+ private:
+  // The last expectation in this sequence.  We use a linked_ptr here
+  // because Sequence objects are copyable and we want the copies to
+  // be aliases.  The linked_ptr allows the copies to co-own and share
+  // the same Expectation object.
+  internal::linked_ptr<Expectation> last_expectation_;
+};  // class Sequence
+
+// An object of this type causes all EXPECT_CALL() statements
+// encountered in its scope to be put in an anonymous sequence.  The
+// work is done in the constructor and destructor.  You should only
+// create an InSequence object on the stack.
+//
+// The sole purpose for this class is to support easy definition of
+// sequential expectations, e.g.
+//
+//   {
+//     InSequence dummy;  // The name of the object doesn't matter.
+//
+//     // The following expectations must match in the order they appear.
+//     EXPECT_CALL(a, Bar())...;
+//     EXPECT_CALL(a, Baz())...;
+//     ...
+//     EXPECT_CALL(b, Xyz())...;
+//   }
+//
+// You can create InSequence objects in multiple threads, as long as
+// they are used to affect different mock objects.  The idea is that
+// each thread can create and set up its own mocks as if it's the only
+// thread.  However, for clarity of your tests we recommend that you
+// set up mocks in the main thread unless you have a good reason not
+// to do so.
+class GTEST_API_ InSequence {
+ public:
+  InSequence();
+  ~InSequence();
+ private:
+  bool sequence_created_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(InSequence);  // NOLINT
+} GTEST_ATTRIBUTE_UNUSED_;
+
+namespace internal {
+
+// Points to the implicit sequence introduced by a living InSequence
+// object (if any) in the current thread or NULL.
+GTEST_API_ extern ThreadLocal<Sequence*> g_gmock_implicit_sequence; + +// Base class for implementing expectations. +// +// There are two reasons for having a type-agnostic base class for +// Expectation: +// +// 1. We need to store collections of expectations of different +// types (e.g. all pre-requisites of a particular expectation, all +// expectations in a sequence). Therefore these expectation objects +// must share a common base class. +// +// 2. We can avoid binary code bloat by moving methods not depending +// on the template argument of Expectation to the base class. +// +// This class is internal and mustn't be used by user code directly. +class GTEST_API_ ExpectationBase { + public: + // source_text is the EXPECT_CALL(...) source that created this Expectation. + ExpectationBase(const char* file, int line, const string& source_text); + + virtual ~ExpectationBase(); + + // Where in the source file was the expectation spec defined? + const char* file() const { return file_; } + int line() const { return line_; } + const char* source_text() const { return source_text_.c_str(); } + // Returns the cardinality specified in the expectation spec. + const Cardinality& cardinality() const { return cardinality_; } + + // Describes the source file location of this expectation. + void DescribeLocationTo(::std::ostream* os) const { + *os << FormatFileLocation(file(), line()) << " "; + } + + // Describes how many times a function call matching this + // expectation has occurred. + void DescribeCallCountTo(::std::ostream* os) const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex); + + // If this mock method has an extra matcher (i.e. .With(matcher)), + // describes it to the ostream. + virtual void MaybeDescribeExtraMatcherTo(::std::ostream* os) = 0; + + protected: + friend class ::testing::Expectation; + friend class UntypedFunctionMockerBase; + + enum Clause { + // Don't change the order of the enum members! + kNone, + kWith, + kTimes, + kInSequence, + kAfter, + kWillOnce, + kWillRepeatedly, + kRetiresOnSaturation + }; + + typedef std::vector<const void*> UntypedActions; + + // Returns an Expectation object that references and co-owns this + // expectation. + virtual Expectation GetHandle() = 0; + + // Asserts that the EXPECT_CALL() statement has the given property. + void AssertSpecProperty(bool property, const string& failure_message) const { + Assert(property, file_, line_, failure_message); + } + + // Expects that the EXPECT_CALL() statement has the given property. + void ExpectSpecProperty(bool property, const string& failure_message) const { + Expect(property, file_, line_, failure_message); + } + + // Explicitly specifies the cardinality of this expectation. Used + // by the subclasses to implement the .Times() clause. + void SpecifyCardinality(const Cardinality& cardinality); + + // Returns true iff the user specified the cardinality explicitly + // using a .Times(). + bool cardinality_specified() const { return cardinality_specified_; } + + // Sets the cardinality of this expectation spec. + void set_cardinality(const Cardinality& a_cardinality) { + cardinality_ = a_cardinality; + } + + // The following group of methods should only be called after the + // EXPECT_CALL() statement, and only when g_gmock_mutex is held by + // the current thread. + + // Retires all pre-requisites of this expectation. + void RetireAllPreRequisites() + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex); + + // Returns true iff this expectation is retired. 
+ bool is_retired() const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + return retired_; + } + + // Retires this expectation. + void Retire() + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + retired_ = true; + } + + // Returns true iff this expectation is satisfied. + bool IsSatisfied() const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + return cardinality().IsSatisfiedByCallCount(call_count_); + } + + // Returns true iff this expectation is saturated. + bool IsSaturated() const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + return cardinality().IsSaturatedByCallCount(call_count_); + } + + // Returns true iff this expectation is over-saturated. + bool IsOverSaturated() const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + return cardinality().IsOverSaturatedByCallCount(call_count_); + } + + // Returns true iff all pre-requisites of this expectation are satisfied. + bool AllPrerequisitesAreSatisfied() const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex); + + // Adds unsatisfied pre-requisites of this expectation to 'result'. + void FindUnsatisfiedPrerequisites(ExpectationSet* result) const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex); + + // Returns the number this expectation has been invoked. + int call_count() const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + return call_count_; + } + + // Increments the number this expectation has been invoked. + void IncrementCallCount() + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + call_count_++; + } + + // Checks the action count (i.e. the number of WillOnce() and + // WillRepeatedly() clauses) against the cardinality if this hasn't + // been done before. Prints a warning if there are too many or too + // few actions. + void CheckActionCountIfNotDone() const + GTEST_LOCK_EXCLUDED_(mutex_); + + friend class ::testing::Sequence; + friend class ::testing::internal::ExpectationTester; + + template <typename Function> + friend class TypedExpectation; + + // Implements the .Times() clause. + void UntypedTimes(const Cardinality& a_cardinality); + + // This group of fields are part of the spec and won't change after + // an EXPECT_CALL() statement finishes. + const char* file_; // The file that contains the expectation. + int line_; // The line number of the expectation. + const string source_text_; // The EXPECT_CALL(...) source text. + // True iff the cardinality is specified explicitly. + bool cardinality_specified_; + Cardinality cardinality_; // The cardinality of the expectation. + // The immediate pre-requisites (i.e. expectations that must be + // satisfied before this expectation can be matched) of this + // expectation. We use linked_ptr in the set because we want an + // Expectation object to be co-owned by its FunctionMocker and its + // successors. This allows multiple mock objects to be deleted at + // different times. + ExpectationSet immediate_prerequisites_; + + // This group of fields are the current state of the expectation, + // and can change as the mock function is called. + int call_count_; // How many times this expectation has been invoked. + bool retired_; // True iff this expectation has retired. + UntypedActions untyped_actions_; + bool extra_matcher_specified_; + bool repeated_action_specified_; // True if a WillRepeatedly() was specified. 
+  bool retires_on_saturation_;
+  Clause last_clause_;
+  mutable bool action_count_checked_;  // Under mutex_.
+  mutable Mutex mutex_;  // Protects action_count_checked_.
+
+  GTEST_DISALLOW_ASSIGN_(ExpectationBase);
+};  // class ExpectationBase
+
+// Implements an expectation for the given function type.
+template <typename F>
+class TypedExpectation : public ExpectationBase {
+ public:
+  typedef typename Function<F>::ArgumentTuple ArgumentTuple;
+  typedef typename Function<F>::ArgumentMatcherTuple ArgumentMatcherTuple;
+  typedef typename Function<F>::Result Result;
+
+  TypedExpectation(FunctionMockerBase<F>* owner,
+                   const char* a_file, int a_line, const string& a_source_text,
+                   const ArgumentMatcherTuple& m)
+      : ExpectationBase(a_file, a_line, a_source_text),
+        owner_(owner),
+        matchers_(m),
+        // By default, extra_matcher_ should match anything.  However,
+        // we cannot initialize it with _ as that triggers a compiler
+        // bug in Symbian's C++ compiler (cannot decide between two
+        // overloaded constructors of Matcher<const ArgumentTuple&>).
+        extra_matcher_(A<const ArgumentTuple&>()),
+        repeated_action_(DoDefault()) {}
+
+  virtual ~TypedExpectation() {
+    // Check the validity of the action count if it hasn't been done
+    // yet (for example, if the expectation was never used).
+    CheckActionCountIfNotDone();
+    for (UntypedActions::const_iterator it = untyped_actions_.begin();
+         it != untyped_actions_.end(); ++it) {
+      delete static_cast<const Action<F>*>(*it);
+    }
+  }
+
+  // Implements the .With() clause.
+  TypedExpectation& With(const Matcher<const ArgumentTuple&>& m) {
+    if (last_clause_ == kWith) {
+      ExpectSpecProperty(false,
+                         ".With() cannot appear "
+                         "more than once in an EXPECT_CALL().");
+    } else {
+      ExpectSpecProperty(last_clause_ < kWith,
+                         ".With() must be the first "
+                         "clause in an EXPECT_CALL().");
+    }
+    last_clause_ = kWith;
+
+    extra_matcher_ = m;
+    extra_matcher_specified_ = true;
+    return *this;
+  }
+
+  // Implements the .Times() clause.
+  TypedExpectation& Times(const Cardinality& a_cardinality) {
+    ExpectationBase::UntypedTimes(a_cardinality);
+    return *this;
+  }
+
+  // Implements the .Times() clause.
+  TypedExpectation& Times(int n) {
+    return Times(Exactly(n));
+  }
+
+  // Implements the .InSequence() clause.
+  TypedExpectation& InSequence(const Sequence& s) {
+    ExpectSpecProperty(last_clause_ <= kInSequence,
+                       ".InSequence() cannot appear after .After(),"
+                       " .WillOnce(), .WillRepeatedly(), or "
+                       ".RetiresOnSaturation().");
+    last_clause_ = kInSequence;
+
+    s.AddExpectation(GetHandle());
+    return *this;
+  }
+  TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2) {
+    return InSequence(s1).InSequence(s2);
+  }
+  TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2,
+                               const Sequence& s3) {
+    return InSequence(s1, s2).InSequence(s3);
+  }
+  TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2,
+                               const Sequence& s3, const Sequence& s4) {
+    return InSequence(s1, s2, s3).InSequence(s4);
+  }
+  TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2,
+                               const Sequence& s3, const Sequence& s4,
+                               const Sequence& s5) {
+    return InSequence(s1, s2, s3, s4).InSequence(s5);
+  }
+
+  // Implements the .After() clause.
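+  // For instance (a sketch; InitX and InitY are illustrative mock
+  // methods), in
+  //
+  //   Expectation init_x = EXPECT_CALL(mock, InitX());
+  //   Expectation init_y = EXPECT_CALL(mock, InitY());
+  //   EXPECT_CALL(mock, Run()).After(init_x, init_y);
+  //
+  // Run() is allowed to match only once both InitX() and InitY() have
+  // been called.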
+ TypedExpectation& After(const ExpectationSet& s) { + ExpectSpecProperty(last_clause_ <= kAfter, + ".After() cannot appear after .WillOnce()," + " .WillRepeatedly(), or " + ".RetiresOnSaturation()."); + last_clause_ = kAfter; + + for (ExpectationSet::const_iterator it = s.begin(); it != s.end(); ++it) { + immediate_prerequisites_ += *it; + } + return *this; + } + TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2) { + return After(s1).After(s2); + } + TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2, + const ExpectationSet& s3) { + return After(s1, s2).After(s3); + } + TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2, + const ExpectationSet& s3, const ExpectationSet& s4) { + return After(s1, s2, s3).After(s4); + } + TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2, + const ExpectationSet& s3, const ExpectationSet& s4, + const ExpectationSet& s5) { + return After(s1, s2, s3, s4).After(s5); + } + + // Implements the .WillOnce() clause. + TypedExpectation& WillOnce(const Action<F>& action) { + ExpectSpecProperty(last_clause_ <= kWillOnce, + ".WillOnce() cannot appear after " + ".WillRepeatedly() or .RetiresOnSaturation()."); + last_clause_ = kWillOnce; + + untyped_actions_.push_back(new Action<F>(action)); + if (!cardinality_specified()) { + set_cardinality(Exactly(static_cast<int>(untyped_actions_.size()))); + } + return *this; + } + + // Implements the .WillRepeatedly() clause. + TypedExpectation& WillRepeatedly(const Action<F>& action) { + if (last_clause_ == kWillRepeatedly) { + ExpectSpecProperty(false, + ".WillRepeatedly() cannot appear " + "more than once in an EXPECT_CALL()."); + } else { + ExpectSpecProperty(last_clause_ < kWillRepeatedly, + ".WillRepeatedly() cannot appear " + "after .RetiresOnSaturation()."); + } + last_clause_ = kWillRepeatedly; + repeated_action_specified_ = true; + + repeated_action_ = action; + if (!cardinality_specified()) { + set_cardinality(AtLeast(static_cast<int>(untyped_actions_.size()))); + } + + // Now that no more action clauses can be specified, we check + // whether their count makes sense. + CheckActionCountIfNotDone(); + return *this; + } + + // Implements the .RetiresOnSaturation() clause. + TypedExpectation& RetiresOnSaturation() { + ExpectSpecProperty(last_clause_ < kRetiresOnSaturation, + ".RetiresOnSaturation() cannot appear " + "more than once."); + last_clause_ = kRetiresOnSaturation; + retires_on_saturation_ = true; + + // Now that no more action clauses can be specified, we check + // whether their count makes sense. + CheckActionCountIfNotDone(); + return *this; + } + + // Returns the matchers for the arguments as specified inside the + // EXPECT_CALL() macro. + const ArgumentMatcherTuple& matchers() const { + return matchers_; + } + + // Returns the matcher specified by the .With() clause. + const Matcher<const ArgumentTuple&>& extra_matcher() const { + return extra_matcher_; + } + + // Returns the action specified by the .WillRepeatedly() clause. + const Action<F>& repeated_action() const { return repeated_action_; } + + // If this mock method has an extra matcher (i.e. .With(matcher)), + // describes it to the ostream. 
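+  // For example, for an expectation written as (a sketch; Add is an
+  // illustrative mock method) EXPECT_CALL(mock, Add(_, _)).With(Lt()),
+  // this prints the description of the Lt() multi-argument matcher.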
+ virtual void MaybeDescribeExtraMatcherTo(::std::ostream* os) { + if (extra_matcher_specified_) { + *os << " Expected args: "; + extra_matcher_.DescribeTo(os); + *os << "\n"; + } + } + + private: + template <typename Function> + friend class FunctionMockerBase; + + // Returns an Expectation object that references and co-owns this + // expectation. + virtual Expectation GetHandle() { + return owner_->GetHandleOf(this); + } + + // The following methods will be called only after the EXPECT_CALL() + // statement finishes and when the current thread holds + // g_gmock_mutex. + + // Returns true iff this expectation matches the given arguments. + bool Matches(const ArgumentTuple& args) const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + return TupleMatches(matchers_, args) && extra_matcher_.Matches(args); + } + + // Returns true iff this expectation should handle the given arguments. + bool ShouldHandleArguments(const ArgumentTuple& args) const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + + // In case the action count wasn't checked when the expectation + // was defined (e.g. if this expectation has no WillRepeatedly() + // or RetiresOnSaturation() clause), we check it when the + // expectation is used for the first time. + CheckActionCountIfNotDone(); + return !is_retired() && AllPrerequisitesAreSatisfied() && Matches(args); + } + + // Describes the result of matching the arguments against this + // expectation to the given ostream. + void ExplainMatchResultTo( + const ArgumentTuple& args, + ::std::ostream* os) const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + + if (is_retired()) { + *os << " Expected: the expectation is active\n" + << " Actual: it is retired\n"; + } else if (!Matches(args)) { + if (!TupleMatches(matchers_, args)) { + ExplainMatchFailureTupleTo(matchers_, args, os); + } + StringMatchResultListener listener; + if (!extra_matcher_.MatchAndExplain(args, &listener)) { + *os << " Expected args: "; + extra_matcher_.DescribeTo(os); + *os << "\n Actual: don't match"; + + internal::PrintIfNotEmpty(listener.str(), os); + *os << "\n"; + } + } else if (!AllPrerequisitesAreSatisfied()) { + *os << " Expected: all pre-requisites are satisfied\n" + << " Actual: the following immediate pre-requisites " + << "are not satisfied:\n"; + ExpectationSet unsatisfied_prereqs; + FindUnsatisfiedPrerequisites(&unsatisfied_prereqs); + int i = 0; + for (ExpectationSet::const_iterator it = unsatisfied_prereqs.begin(); + it != unsatisfied_prereqs.end(); ++it) { + it->expectation_base()->DescribeLocationTo(os); + *os << "pre-requisite #" << i++ << "\n"; + } + *os << " (end of pre-requisites)\n"; + } else { + // This line is here just for completeness' sake. It will never + // be executed as currently the ExplainMatchResultTo() function + // is called only when the mock function call does NOT match the + // expectation. + *os << "The call matches the expectation.\n"; + } + } + + // Returns the action that should be taken for the current invocation. 
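+  // For example, under (illustrative values)
+  //
+  //   EXPECT_CALL(mock, Get())
+  //       .WillOnce(Return(1))
+  //       .WillOnce(Return(2))
+  //       .WillRepeatedly(Return(3));
+  //
+  // the first call takes Return(1), the second Return(2), and every
+  // later call takes the repeated action Return(3).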
+  const Action<F>& GetCurrentAction(
+      const FunctionMockerBase<F>* mocker,
+      const ArgumentTuple& args) const
+          GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    const int count = call_count();
+    Assert(count >= 1, __FILE__, __LINE__,
+           "call_count() is <= 0 when GetCurrentAction() is "
+           "called - this should never happen.");
+
+    const int action_count = static_cast<int>(untyped_actions_.size());
+    if (action_count > 0 && !repeated_action_specified_ &&
+        count > action_count) {
+      // If there is at least one WillOnce() and no WillRepeatedly(),
+      // we warn the user when the WillOnce() clauses ran out.
+      ::std::stringstream ss;
+      DescribeLocationTo(&ss);
+      ss << "Actions ran out in " << source_text() << "...\n"
+         << "Called " << count << " times, but only "
+         << action_count << " WillOnce()"
+         << (action_count == 1 ? " is" : "s are") << " specified - ";
+      mocker->DescribeDefaultActionTo(args, &ss);
+      Log(kWarning, ss.str(), 1);
+    }
+
+    return count <= action_count ?
+        *static_cast<const Action<F>*>(untyped_actions_[count - 1]) :
+        repeated_action();
+  }
+
+  // Given the arguments of a mock function call, if the call will
+  // over-saturate this expectation, returns the default action;
+  // otherwise, returns the next action in this expectation.  Also
+  // describes *what* happened to 'what', and explains *why* Google
+  // Mock does it to 'why'.  This method is not const as it calls
+  // IncrementCallCount().  A return value of NULL means the default
+  // action.
+  const Action<F>* GetActionForArguments(
+      const FunctionMockerBase<F>* mocker,
+      const ArgumentTuple& args,
+      ::std::ostream* what,
+      ::std::ostream* why)
+          GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    if (IsSaturated()) {
+      // We have an excessive call.
+      IncrementCallCount();
+      *what << "Mock function called more times than expected - ";
+      mocker->DescribeDefaultActionTo(args, what);
+      DescribeCallCountTo(why);
+
+      // TODO(wan@google.com): allow the user to control whether
+      // unexpected calls should fail immediately or continue using a
+      // flag --gmock_unexpected_calls_are_fatal.
+      return NULL;
+    }
+
+    IncrementCallCount();
+    RetireAllPreRequisites();
+
+    if (retires_on_saturation_ && IsSaturated()) {
+      Retire();
+    }
+
+    // Must be done after IncrementCallCount()!
+    *what << "Mock function call matches " << source_text() << "...\n";
+    return &(GetCurrentAction(mocker, args));
+  }
+
+  // All the fields below won't change once the EXPECT_CALL()
+  // statement finishes.
+  FunctionMockerBase<F>* const owner_;
+  ArgumentMatcherTuple matchers_;
+  Matcher<const ArgumentTuple&> extra_matcher_;
+  Action<F> repeated_action_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TypedExpectation);
+};  // class TypedExpectation
+
+// A MockSpec object is used by ON_CALL() or EXPECT_CALL() for
+// specifying the default behavior of, or expectation on, a mock
+// function.
+
+// Note: class MockSpec really belongs to the ::testing namespace.
+// However if we define it in ::testing, MSVC will complain when
+// classes in ::testing::internal declare it as a friend class
+// template.  To work around this compiler bug, we define MockSpec in
+// ::testing::internal and import it into ::testing.
+
+// Logs a message including file and line number information.
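+// (With --gmock_verbose=info, for example, each ON_CALL/EXPECT_CALL
+// site is reported through this helper.)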
+GTEST_API_ void LogWithLocation(testing::internal::LogSeverity severity, + const char* file, int line, + const string& message); + +template <typename F> +class MockSpec { + public: + typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple; + typedef typename internal::Function<F>::ArgumentMatcherTuple + ArgumentMatcherTuple; + + // Constructs a MockSpec object, given the function mocker object + // that the spec is associated with. + explicit MockSpec(internal::FunctionMockerBase<F>* function_mocker) + : function_mocker_(function_mocker) {} + + // Adds a new default action spec to the function mocker and returns + // the newly created spec. + internal::OnCallSpec<F>& InternalDefaultActionSetAt( + const char* file, int line, const char* obj, const char* call) { + LogWithLocation(internal::kInfo, file, line, + string("ON_CALL(") + obj + ", " + call + ") invoked"); + return function_mocker_->AddNewOnCallSpec(file, line, matchers_); + } + + // Adds a new expectation spec to the function mocker and returns + // the newly created spec. + internal::TypedExpectation<F>& InternalExpectedAt( + const char* file, int line, const char* obj, const char* call) { + const string source_text(string("EXPECT_CALL(") + obj + ", " + call + ")"); + LogWithLocation(internal::kInfo, file, line, source_text + " invoked"); + return function_mocker_->AddNewExpectation( + file, line, source_text, matchers_); + } + + private: + template <typename Function> + friend class internal::FunctionMocker; + + void SetMatchers(const ArgumentMatcherTuple& matchers) { + matchers_ = matchers; + } + + // The function mocker that owns this spec. + internal::FunctionMockerBase<F>* const function_mocker_; + // The argument matchers specified in the spec. + ArgumentMatcherTuple matchers_; + + GTEST_DISALLOW_ASSIGN_(MockSpec); +}; // class MockSpec + +// Wrapper type for generically holding an ordinary value or lvalue reference. +// If T is not a reference type, it must be copyable or movable. +// ReferenceOrValueWrapper<T> is movable, and will also be copyable unless +// T is a move-only value type (which means that it will always be copyable +// if the current platform does not support move semantics). +// +// The primary template defines handling for values, but function header +// comments describe the contract for the whole template (including +// specializations). +template <typename T> +class ReferenceOrValueWrapper { + public: + // Constructs a wrapper from the given value/reference. + explicit ReferenceOrValueWrapper(T value) + : value_(::testing::internal::move(value)) { + } + + // Unwraps and returns the underlying value/reference, exactly as + // originally passed. The behavior of calling this more than once on + // the same object is unspecified. + T Unwrap() { return ::testing::internal::move(value_); } + + // Provides nondestructive access to the underlying value/reference. + // Always returns a const reference (more precisely, + // const RemoveReference<T>&). The behavior of calling this after + // calling Unwrap on the same object is unspecified. + const T& Peek() const { + return value_; + } + + private: + T value_; +}; + +// Specialization for lvalue reference types. See primary template +// for documentation. 
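+// For instance, for a mock method returning int&, a sketch of the
+// behavior: ReferenceOrValueWrapper<int&> stores only a pointer, and
+// Unwrap() hands back the original reference rather than a copy.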
+template <typename T>
+class ReferenceOrValueWrapper<T&> {
+ public:
+  // Workaround for debatable pass-by-reference lint warning (c-library-team
+  // policy precludes NOLINT in this context)
+  typedef T& reference;
+  explicit ReferenceOrValueWrapper(reference ref)
+      : value_ptr_(&ref) {}
+  T& Unwrap() { return *value_ptr_; }
+  const T& Peek() const { return *value_ptr_; }
+
+ private:
+  T* value_ptr_;
+};
+
+// MSVC warns about using 'this' in base member initializer list, so
+// we need to temporarily disable the warning.  We have to do it for
+// the entire class to suppress the warning, even though it's about
+// the constructor only.
+
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4355)  // Temporarily disables warning 4355.
+#endif  // _MSC_VER
+
+// C++ treats the void type specially.  For example, you cannot define
+// a void-typed variable or pass a void value to a function.
+// ActionResultHolder<T> holds a value of type T, where T must be a
+// copyable type or void (T doesn't need to be default-constructible).
+// It hides the syntactic difference between void and other types, and
+// is used to unify the code for invoking both void-returning and
+// non-void-returning mock functions.
+
+// Untyped base class for ActionResultHolder<T>.
+class UntypedActionResultHolderBase {
+ public:
+  virtual ~UntypedActionResultHolderBase() {}
+
+  // Prints the held value as an action's result to os.
+  virtual void PrintAsActionResult(::std::ostream* os) const = 0;
+};
+
+// This generic definition is used when T is not void.
+template <typename T>
+class ActionResultHolder : public UntypedActionResultHolderBase {
+ public:
+  // Returns the held value.  Must not be called more than once.
+  T Unwrap() {
+    return result_.Unwrap();
+  }
+
+  // Prints the held value as an action's result to os.
+  virtual void PrintAsActionResult(::std::ostream* os) const {
+    *os << "\n          Returns: ";
+    // T may be a reference type, so we don't use UniversalPrint().
+    UniversalPrinter<T>::Print(result_.Peek(), os);
+  }
+
+  // Performs the given mock function's default action and returns the
+  // result in a new-ed ActionResultHolder.
+  template <typename F>
+  static ActionResultHolder* PerformDefaultAction(
+      const FunctionMockerBase<F>* func_mocker,
+      const typename Function<F>::ArgumentTuple& args,
+      const string& call_description) {
+    return new ActionResultHolder(Wrapper(
+        func_mocker->PerformDefaultAction(args, call_description)));
+  }
+
+  // Performs the given action and returns the result in a new-ed
+  // ActionResultHolder.
+  template <typename F>
+  static ActionResultHolder*
+  PerformAction(const Action<F>& action,
+                const typename Function<F>::ArgumentTuple& args) {
+    return new ActionResultHolder(Wrapper(action.Perform(args)));
+  }
+
+ private:
+  typedef ReferenceOrValueWrapper<T> Wrapper;
+
+  explicit ActionResultHolder(Wrapper result)
+      : result_(::testing::internal::move(result)) {
+  }
+
+  Wrapper result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ActionResultHolder);
+};
+
+// Specialization for T = void.
+template <>
+class ActionResultHolder<void> : public UntypedActionResultHolderBase {
+ public:
+  void Unwrap() { }
+
+  virtual void PrintAsActionResult(::std::ostream* /* os */) const {}
+
+  // Performs the given mock function's default action and returns ownership
+  // of an empty ActionResultHolder*.
+  template <typename F>
+  static ActionResultHolder* PerformDefaultAction(
+      const FunctionMockerBase<F>* func_mocker,
+      const typename Function<F>::ArgumentTuple& args,
+      const string& call_description) {
+    func_mocker->PerformDefaultAction(args, call_description);
+    return new ActionResultHolder;
+  }
+
+  // Performs the given action and returns ownership of an empty
+  // ActionResultHolder*.
+  template <typename F>
+  static ActionResultHolder* PerformAction(
+      const Action<F>& action,
+      const typename Function<F>::ArgumentTuple& args) {
+    action.Perform(args);
+    return new ActionResultHolder;
+  }
+
+ private:
+  ActionResultHolder() {}
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ActionResultHolder);
+};
+
+// The base of the function mocker class for the given function type.
+// We put the methods in this class instead of its child to avoid code
+// bloat.
+template <typename F>
+class FunctionMockerBase : public UntypedFunctionMockerBase {
+ public:
+  typedef typename Function<F>::Result Result;
+  typedef typename Function<F>::ArgumentTuple ArgumentTuple;
+  typedef typename Function<F>::ArgumentMatcherTuple ArgumentMatcherTuple;
+
+  FunctionMockerBase() : current_spec_(this) {}
+
+  // The destructor verifies that all expectations on this mock
+  // function have been satisfied.  If not, it will report Google Test
+  // non-fatal failures for the violations.
+  virtual ~FunctionMockerBase()
+        GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+    MutexLock l(&g_gmock_mutex);
+    VerifyAndClearExpectationsLocked();
+    Mock::UnregisterLocked(this);
+    ClearDefaultActionsLocked();
+  }
+
+  // Returns the ON_CALL spec that matches this mock function with the
+  // given arguments; returns NULL if no matching ON_CALL is found.
+  // L = *
+  const OnCallSpec<F>* FindOnCallSpec(
+      const ArgumentTuple& args) const {
+    for (UntypedOnCallSpecs::const_reverse_iterator it
+             = untyped_on_call_specs_.rbegin();
+         it != untyped_on_call_specs_.rend(); ++it) {
+      const OnCallSpec<F>* spec = static_cast<const OnCallSpec<F>*>(*it);
+      if (spec->Matches(args))
+        return spec;
+    }
+
+    return NULL;
+  }
+
+  // Performs the default action of this mock function on the given
+  // arguments and returns the result.  Asserts (or throws if
+  // exceptions are enabled) with a helpful call description if there
+  // is no valid return value.  This method doesn't depend on the
+  // mutable state of this object, and thus can be called concurrently
+  // without locking.
+  // L = *
+  Result PerformDefaultAction(const ArgumentTuple& args,
+                              const string& call_description) const {
+    const OnCallSpec<F>* const spec =
+        this->FindOnCallSpec(args);
+    if (spec != NULL) {
+      return spec->GetAction().Perform(args);
+    }
+    const string message = call_description +
+        "\n    The mock function has no default action "
+        "set, and its return type has no default value set.";
+#if GTEST_HAS_EXCEPTIONS
+    if (!DefaultValue<Result>::Exists()) {
+      throw std::runtime_error(message);
+    }
+#else
+    Assert(DefaultValue<Result>::Exists(), "", -1, message);
+#endif
+    return DefaultValue<Result>::Get();
+  }
+
+  // Performs the default action with the given arguments and returns
+  // the action's result.  The call description string will be used in
+  // the error message to describe the call in the case the default
+  // action fails.  The caller is responsible for deleting the result.
+ // L = * + virtual UntypedActionResultHolderBase* UntypedPerformDefaultAction( + const void* untyped_args, // must point to an ArgumentTuple + const string& call_description) const { + const ArgumentTuple& args = + *static_cast<const ArgumentTuple*>(untyped_args); + return ResultHolder::PerformDefaultAction(this, args, call_description); + } + + // Performs the given action with the given arguments and returns + // the action's result. The caller is responsible for deleting the + // result. + // L = * + virtual UntypedActionResultHolderBase* UntypedPerformAction( + const void* untyped_action, const void* untyped_args) const { + // Make a copy of the action before performing it, in case the + // action deletes the mock object (and thus deletes itself). + const Action<F> action = *static_cast<const Action<F>*>(untyped_action); + const ArgumentTuple& args = + *static_cast<const ArgumentTuple*>(untyped_args); + return ResultHolder::PerformAction(action, args); + } + + // Implements UntypedFunctionMockerBase::ClearDefaultActionsLocked(): + // clears the ON_CALL()s set on this mock function. + virtual void ClearDefaultActionsLocked() + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + + // Deleting our default actions may trigger other mock objects to be + // deleted, for example if an action contains a reference counted smart + // pointer to that mock object, and that is the last reference. So if we + // delete our actions within the context of the global mutex we may deadlock + // when this method is called again. Instead, make a copy of the set of + // actions to delete, clear our set within the mutex, and then delete the + // actions outside of the mutex. + UntypedOnCallSpecs specs_to_delete; + untyped_on_call_specs_.swap(specs_to_delete); + + g_gmock_mutex.Unlock(); + for (UntypedOnCallSpecs::const_iterator it = + specs_to_delete.begin(); + it != specs_to_delete.end(); ++it) { + delete static_cast<const OnCallSpec<F>*>(*it); + } + + // Lock the mutex again, since the caller expects it to be locked when we + // return. + g_gmock_mutex.Lock(); + } + + protected: + template <typename Function> + friend class MockSpec; + + typedef ActionResultHolder<Result> ResultHolder; + + // Returns the result of invoking this mock function with the given + // arguments. This function can be safely called from multiple + // threads concurrently. + Result InvokeWith(const ArgumentTuple& args) + GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { + scoped_ptr<ResultHolder> holder( + DownCast_<ResultHolder*>(this->UntypedInvokeWith(&args))); + return holder->Unwrap(); + } + + // Adds and returns a default action spec for this mock function. + OnCallSpec<F>& AddNewOnCallSpec( + const char* file, int line, + const ArgumentMatcherTuple& m) + GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { + Mock::RegisterUseByOnCallOrExpectCall(MockObject(), file, line); + OnCallSpec<F>* const on_call_spec = new OnCallSpec<F>(file, line, m); + untyped_on_call_specs_.push_back(on_call_spec); + return *on_call_spec; + } + + // Adds and returns an expectation spec for this mock function. 
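+  // This is what an EXPECT_CALL(obj, call) statement ultimately
+  // reaches via MockSpec::InternalExpectedAt(); e.g. (a sketch)
+  //
+  //   EXPECT_CALL(mock, Bar(_))
+  //
+  // records a TypedExpectation carrying __FILE__/__LINE__ and the
+  // source text "EXPECT_CALL(mock, Bar(_))".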
+  TypedExpectation<F>& AddNewExpectation(
+      const char* file,
+      int line,
+      const string& source_text,
+      const ArgumentMatcherTuple& m)
+          GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+    Mock::RegisterUseByOnCallOrExpectCall(MockObject(), file, line);
+    TypedExpectation<F>* const expectation =
+        new TypedExpectation<F>(this, file, line, source_text, m);
+    const linked_ptr<ExpectationBase> untyped_expectation(expectation);
+    untyped_expectations_.push_back(untyped_expectation);
+
+    // Adds this expectation into the implicit sequence if there is one.
+    Sequence* const implicit_sequence = g_gmock_implicit_sequence.get();
+    if (implicit_sequence != NULL) {
+      implicit_sequence->AddExpectation(Expectation(untyped_expectation));
+    }
+
+    return *expectation;
+  }
+
+  // The current spec (either default action spec or expectation spec)
+  // being described on this function mocker.
+  MockSpec<F>& current_spec() { return current_spec_; }
+
+ private:
+  template <typename Func> friend class TypedExpectation;
+
+  // Some utilities needed for implementing UntypedInvokeWith().
+
+  // Describes what default action will be performed for the given
+  // arguments.
+  // L = *
+  void DescribeDefaultActionTo(const ArgumentTuple& args,
+                               ::std::ostream* os) const {
+    const OnCallSpec<F>* const spec = FindOnCallSpec(args);
+
+    if (spec == NULL) {
+      *os << (internal::type_equals<Result, void>::value ?
+              "returning directly.\n" :
+              "returning default value.\n");
+    } else {
+      *os << "taking default action specified at:\n"
+          << FormatFileLocation(spec->file(), spec->line()) << "\n";
+    }
+  }
+
+  // Writes a message that the call is uninteresting (i.e. neither
+  // explicitly expected nor explicitly unexpected) to the given
+  // ostream.
+  virtual void UntypedDescribeUninterestingCall(
+      const void* untyped_args,
+      ::std::ostream* os) const
+          GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+    const ArgumentTuple& args =
+        *static_cast<const ArgumentTuple*>(untyped_args);
+    *os << "Uninteresting mock function call - ";
+    DescribeDefaultActionTo(args, os);
+    *os << "    Function call: " << Name();
+    UniversalPrint(args, os);
+  }
+
+  // Returns the expectation that matches the given function arguments
+  // (or NULL if there's no match); when a match is found,
+  // untyped_action is set to point to the action that should be
+  // performed (or NULL if the action is "do default"), and
+  // is_excessive is modified to indicate whether the call exceeds the
+  // expected number.
+  //
+  // Critical section: We must find the matching expectation and the
+  // corresponding action that needs to be taken in an ATOMIC
+  // transaction.  Otherwise another thread may call this mock
+  // method in the middle and mess up the state.
+  //
+  // However, performing the action has to be left out of the critical
+  // section.  The reason is that we have no control on what the
+  // action does (it can invoke an arbitrary user function or even a
+  // mock function) and excessive locking could cause a deadlock.
+  virtual const ExpectationBase* UntypedFindMatchingExpectation(
+      const void* untyped_args,
+      const void** untyped_action, bool* is_excessive,
+      ::std::ostream* what, ::std::ostream* why)
+          GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+    const ArgumentTuple& args =
+        *static_cast<const ArgumentTuple*>(untyped_args);
+    MutexLock l(&g_gmock_mutex);
+    TypedExpectation<F>* exp = this->FindMatchingExpectationLocked(args);
+    if (exp == NULL) {  // A match wasn't found.
+      this->FormatUnexpectedCallMessageLocked(args, what, why);
+      return NULL;
+    }
+
+    // This line must be done before calling GetActionForArguments(),
+    // which will increment the call count for *exp and thus affect
+    // its saturation status.
+    *is_excessive = exp->IsSaturated();
+    const Action<F>* action = exp->GetActionForArguments(this, args, what, why);
+    if (action != NULL && action->IsDoDefault())
+      action = NULL;  // Normalize "do default" to NULL.
+    *untyped_action = action;
+    return exp;
+  }
+
+  // Prints the given function arguments to the ostream.
+  virtual void UntypedPrintArgs(const void* untyped_args,
+                                ::std::ostream* os) const {
+    const ArgumentTuple& args =
+        *static_cast<const ArgumentTuple*>(untyped_args);
+    UniversalPrint(args, os);
+  }
+
+  // Returns the expectation that matches the arguments, or NULL if no
+  // expectation matches them.
+  TypedExpectation<F>* FindMatchingExpectationLocked(
+      const ArgumentTuple& args) const
+          GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    for (typename UntypedExpectations::const_reverse_iterator it =
+             untyped_expectations_.rbegin();
+         it != untyped_expectations_.rend(); ++it) {
+      TypedExpectation<F>* const exp =
+          static_cast<TypedExpectation<F>*>(it->get());
+      if (exp->ShouldHandleArguments(args)) {
+        return exp;
+      }
+    }
+    return NULL;
+  }
+
+  // Returns a message that the arguments don't match any expectation.
+  void FormatUnexpectedCallMessageLocked(
+      const ArgumentTuple& args,
+      ::std::ostream* os,
+      ::std::ostream* why) const
+          GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    *os << "\nUnexpected mock function call - ";
+    DescribeDefaultActionTo(args, os);
+    PrintTriedExpectationsLocked(args, why);
+  }
+
+  // Prints a list of expectations that have been tried against the
+  // current mock function call.
+  void PrintTriedExpectationsLocked(
+      const ArgumentTuple& args,
+      ::std::ostream* why) const
+          GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+    g_gmock_mutex.AssertHeld();
+    const int count = static_cast<int>(untyped_expectations_.size());
+    *why << "Google Mock tried the following " << count << " "
+         << (count == 1 ? "expectation, but it didn't match" :
+             "expectations, but none matched")
+         << ":\n";
+    for (int i = 0; i < count; i++) {
+      TypedExpectation<F>* const expectation =
+          static_cast<TypedExpectation<F>*>(untyped_expectations_[i].get());
+      *why << "\n";
+      expectation->DescribeLocationTo(why);
+      if (count > 1) {
+        *why << "tried expectation #" << i << ": ";
+      }
+      *why << expectation->source_text() << "...\n";
+      expectation->ExplainMatchResultTo(args, why);
+      expectation->DescribeCallCountTo(why);
+    }
+  }
+
+  // The current spec (either default action spec or expectation spec)
+  // being described on this function mocker.
+  MockSpec<F> current_spec_;
+
+  // There is no generally useful and implementable semantics of
+  // copying a mock object, so copying a mock is usually a user error.
+  // Thus we disallow copying function mockers.  If the user really
+  // wants to copy a mock object, he should implement his own copy
+  // operation, for example:
+  //
+  //   class MockFoo : public Foo {
+  //    public:
+  //     // Defines a copy constructor explicitly.
+  //     MockFoo(const MockFoo& src) {}
+  //     ...
+  //   };
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(FunctionMockerBase);
+};  // class FunctionMockerBase
+
+#ifdef _MSC_VER
+# pragma warning(pop)  // Restores the warning state.
+#endif  // _MSC_VER
+
+// Implements methods of FunctionMockerBase.
+ +// Verifies that all expectations on this mock function have been +// satisfied. Reports one or more Google Test non-fatal failures and +// returns false if not. + +// Reports an uninteresting call (whose description is in msg) in the +// manner specified by 'reaction'. +void ReportUninterestingCall(CallReaction reaction, const string& msg); + +} // namespace internal + +// The style guide prohibits "using" statements in a namespace scope +// inside a header file. However, the MockSpec class template is +// meant to be defined in the ::testing namespace. The following line +// is just a trick for working around a bug in MSVC 8.0, which cannot +// handle it if we define MockSpec in ::testing. +using internal::MockSpec; + +// Const(x) is a convenient function for obtaining a const reference +// to x. This is useful for setting expectations on an overloaded +// const mock method, e.g. +// +// class MockFoo : public FooInterface { +// public: +// MOCK_METHOD0(Bar, int()); +// MOCK_CONST_METHOD0(Bar, int&()); +// }; +// +// MockFoo foo; +// // Expects a call to non-const MockFoo::Bar(). +// EXPECT_CALL(foo, Bar()); +// // Expects a call to const MockFoo::Bar(). +// EXPECT_CALL(Const(foo), Bar()); +template <typename T> +inline const T& Const(const T& x) { return x; } + +// Constructs an Expectation object that references and co-owns exp. +inline Expectation::Expectation(internal::ExpectationBase& exp) // NOLINT + : expectation_base_(exp.GetHandle().expectation_base()) {} + +} // namespace testing + +// A separate macro is required to avoid compile errors when the name +// of the method used in call is a result of macro expansion. +// See CompilesWithMethodNameExpandedFromMacro tests in +// internal/gmock-spec-builders_test.cc for more details. +#define GMOCK_ON_CALL_IMPL_(obj, call) \ + ((obj).gmock_##call).InternalDefaultActionSetAt(__FILE__, __LINE__, \ + #obj, #call) +#define ON_CALL(obj, call) GMOCK_ON_CALL_IMPL_(obj, call) + +#define GMOCK_EXPECT_CALL_IMPL_(obj, call) \ + ((obj).gmock_##call).InternalExpectedAt(__FILE__, __LINE__, #obj, #call) +#define EXPECT_CALL(obj, call) GMOCK_EXPECT_CALL_IMPL_(obj, call) + +#endif // GMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_ diff --git a/utils/unittest/googlemock/include/gmock/gmock.h b/utils/unittest/googlemock/include/gmock/gmock.h new file mode 100644 index 000000000000..6735c71bf8aa --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/gmock.h @@ -0,0 +1,94 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This is the main header file a user should include.
+
+#ifndef GMOCK_INCLUDE_GMOCK_GMOCK_H_
+#define GMOCK_INCLUDE_GMOCK_GMOCK_H_
+
+// This file implements the following syntax:
+//
+//   ON_CALL(mock_object, Method(...))
+//     .With(...) ?
+//     .WillByDefault(...);
+//
+// where With() is optional and WillByDefault() must appear exactly
+// once.
+//
+//   EXPECT_CALL(mock_object, Method(...))
+//     .With(...) ?
+//     .Times(...) ?
+//     .InSequence(...) *
+//     .WillOnce(...) *
+//     .WillRepeatedly(...) ?
+//     .RetiresOnSaturation() ? ;
+//
+// where all clauses are optional and WillOnce() can be repeated.
+
+#include "gmock/gmock-actions.h"
+#include "gmock/gmock-cardinalities.h"
+#include "gmock/gmock-generated-actions.h"
+#include "gmock/gmock-generated-function-mockers.h"
+#include "gmock/gmock-generated-nice-strict.h"
+#include "gmock/gmock-generated-matchers.h"
+#include "gmock/gmock-matchers.h"
+#include "gmock/gmock-more-actions.h"
+#include "gmock/gmock-more-matchers.h"
+#include "gmock/internal/gmock-internal-utils.h"
+
+namespace testing {
+
+// Declares Google Mock flags that we want a user to use programmatically.
+GMOCK_DECLARE_bool_(catch_leaked_mocks);
+GMOCK_DECLARE_string_(verbose);
+
+// Initializes Google Mock.  This must be called before running the
+// tests.  In particular, it parses the command line for the flags
+// that Google Mock recognizes.  Whenever a Google Mock flag is seen,
+// it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Mock flag variables are
+// updated.
+//
+// Since Google Test is needed for Google Mock to work, this function
+// also initializes Google Test and parses its flags, if that hasn't
+// been done.
+GTEST_API_ void InitGoogleMock(int* argc, char** argv);
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleMock(int* argc, wchar_t** argv);
+
+}  // namespace testing
+
+#endif  // GMOCK_INCLUDE_GMOCK_GMOCK_H_
diff --git a/utils/unittest/googlemock/include/gmock/internal/custom/gmock-generated-actions.h b/utils/unittest/googlemock/include/gmock/internal/custom/gmock-generated-actions.h
new file mode 100644
index 000000000000..7dc3b1ad5416
--- /dev/null
+++ b/utils/unittest/googlemock/include/gmock/internal/custom/gmock-generated-actions.h
@@ -0,0 +1,8 @@
+// This file was GENERATED by command:
+//     pump.py gmock-generated-actions.h.pump
+// DO NOT EDIT BY HAND!!!
+ +#ifndef GMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_ +#define GMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_ + +#endif // GMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_ diff --git a/utils/unittest/googlemock/include/gmock/internal/custom/gmock-matchers.h b/utils/unittest/googlemock/include/gmock/internal/custom/gmock-matchers.h new file mode 100644 index 000000000000..f2efef91dbe5 --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/internal/custom/gmock-matchers.h @@ -0,0 +1,39 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// ============================================================ +// An installation-specific extension point for gmock-matchers.h. +// ============================================================ +// +// Adds google3 callback support to CallableTraits. +// +#ifndef GMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_CALLBACK_MATCHERS_H_ +#define GMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_CALLBACK_MATCHERS_H_ + +#endif // GMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_CALLBACK_MATCHERS_H_ diff --git a/utils/unittest/googlemock/include/gmock/internal/custom/gmock-port.h b/utils/unittest/googlemock/include/gmock/internal/custom/gmock-port.h new file mode 100644 index 000000000000..9ce8bfe06bf1 --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/internal/custom/gmock-port.h @@ -0,0 +1,46 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. +// The following macros can be defined: +// +// Flag related macros: +// GMOCK_DECLARE_bool_(name) +// GMOCK_DECLARE_int32_(name) +// GMOCK_DECLARE_string_(name) +// GMOCK_DEFINE_bool_(name, default_val, doc) +// GMOCK_DEFINE_int32_(name, default_val, doc) +// GMOCK_DEFINE_string_(name, default_val, doc) +// +// ** Custom implementation starts here ** + +#ifndef GMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_ +#define GMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_ + +#endif // GMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_ diff --git a/utils/unittest/googlemock/include/gmock/internal/gmock-generated-internal-utils.h b/utils/unittest/googlemock/include/gmock/internal/gmock-generated-internal-utils.h new file mode 100644 index 000000000000..7811e43f87c1 --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/internal/gmock-generated-internal-utils.h @@ -0,0 +1,279 @@ +// This file was GENERATED by command: +// pump.py gmock-generated-internal-utils.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file contains template meta-programming utility classes needed +// for implementing Google Mock. + +#ifndef GMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_GENERATED_INTERNAL_UTILS_H_ +#define GMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_GENERATED_INTERNAL_UTILS_H_ + +#include "gmock/internal/gmock-port.h" + +namespace testing { + +template <typename T> +class Matcher; + +namespace internal { + +// An IgnoredValue object can be implicitly constructed from ANY value. +// This is used in implementing the IgnoreResult(a) action. +class IgnoredValue { + public: + // This constructor template allows any value to be implicitly + // converted to IgnoredValue. The object has no data member and + // doesn't try to remember anything about the argument. We + // deliberately omit the 'explicit' keyword in order to allow the + // conversion to be implicit. + template <typename T> + IgnoredValue(const T& /* ignored */) {} // NOLINT(runtime/explicit) +}; + +// MatcherTuple<T>::type is a tuple type where each field is a Matcher +// for the corresponding field in tuple type T. +template <typename Tuple> +struct MatcherTuple; + +template <> +struct MatcherTuple< ::testing::tuple<> > { + typedef ::testing::tuple< > type; +}; + +template <typename A1> +struct MatcherTuple< ::testing::tuple<A1> > { + typedef ::testing::tuple<Matcher<A1> > type; +}; + +template <typename A1, typename A2> +struct MatcherTuple< ::testing::tuple<A1, A2> > { + typedef ::testing::tuple<Matcher<A1>, Matcher<A2> > type; +}; + +template <typename A1, typename A2, typename A3> +struct MatcherTuple< ::testing::tuple<A1, A2, A3> > { + typedef ::testing::tuple<Matcher<A1>, Matcher<A2>, Matcher<A3> > type; +}; + +template <typename A1, typename A2, typename A3, typename A4> +struct MatcherTuple< ::testing::tuple<A1, A2, A3, A4> > { + typedef ::testing::tuple<Matcher<A1>, Matcher<A2>, Matcher<A3>, + Matcher<A4> > type; +}; + +template <typename A1, typename A2, typename A3, typename A4, typename A5> +struct MatcherTuple< ::testing::tuple<A1, A2, A3, A4, A5> > { + typedef ::testing::tuple<Matcher<A1>, Matcher<A2>, Matcher<A3>, Matcher<A4>, + Matcher<A5> > type; +}; + +template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6> +struct MatcherTuple< ::testing::tuple<A1, A2, A3, A4, A5, A6> > { + typedef ::testing::tuple<Matcher<A1>, Matcher<A2>, Matcher<A3>, Matcher<A4>, + Matcher<A5>, Matcher<A6> > type; +}; + +template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7> +struct MatcherTuple< ::testing::tuple<A1, A2, A3, A4, A5, A6, A7> > { + typedef ::testing::tuple<Matcher<A1>, Matcher<A2>, Matcher<A3>, Matcher<A4>, + Matcher<A5>, Matcher<A6>, Matcher<A7> > type; +}; + +template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7, typename A8> +struct MatcherTuple< ::testing::tuple<A1, A2, A3, A4, A5, 
A6, A7, A8> > { + typedef ::testing::tuple<Matcher<A1>, Matcher<A2>, Matcher<A3>, Matcher<A4>, + Matcher<A5>, Matcher<A6>, Matcher<A7>, Matcher<A8> > type; +}; + +template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7, typename A8, typename A9> +struct MatcherTuple< ::testing::tuple<A1, A2, A3, A4, A5, A6, A7, A8, A9> > { + typedef ::testing::tuple<Matcher<A1>, Matcher<A2>, Matcher<A3>, Matcher<A4>, + Matcher<A5>, Matcher<A6>, Matcher<A7>, Matcher<A8>, Matcher<A9> > type; +}; + +template <typename A1, typename A2, typename A3, typename A4, typename A5, + typename A6, typename A7, typename A8, typename A9, typename A10> +struct MatcherTuple< ::testing::tuple<A1, A2, A3, A4, A5, A6, A7, A8, A9, + A10> > { + typedef ::testing::tuple<Matcher<A1>, Matcher<A2>, Matcher<A3>, Matcher<A4>, + Matcher<A5>, Matcher<A6>, Matcher<A7>, Matcher<A8>, Matcher<A9>, + Matcher<A10> > type; +}; + +// Template struct Function<F>, where F must be a function type, contains +// the following typedefs: +// +// Result: the function's return type. +// ArgumentN: the type of the N-th argument, where N starts with 1. +// ArgumentTuple: the tuple type consisting of all parameters of F. +// ArgumentMatcherTuple: the tuple type consisting of Matchers for all +// parameters of F. +// MakeResultVoid: the function type obtained by substituting void +// for the return type of F. +// MakeResultIgnoredValue: +// the function type obtained by substituting Something +// for the return type of F. +template <typename F> +struct Function; + +template <typename R> +struct Function<R()> { + typedef R Result; + typedef ::testing::tuple<> ArgumentTuple; + typedef typename MatcherTuple<ArgumentTuple>::type ArgumentMatcherTuple; + typedef void MakeResultVoid(); + typedef IgnoredValue MakeResultIgnoredValue(); +}; + +template <typename R, typename A1> +struct Function<R(A1)> + : Function<R()> { + typedef A1 Argument1; + typedef ::testing::tuple<A1> ArgumentTuple; + typedef typename MatcherTuple<ArgumentTuple>::type ArgumentMatcherTuple; + typedef void MakeResultVoid(A1); + typedef IgnoredValue MakeResultIgnoredValue(A1); +}; + +template <typename R, typename A1, typename A2> +struct Function<R(A1, A2)> + : Function<R(A1)> { + typedef A2 Argument2; + typedef ::testing::tuple<A1, A2> ArgumentTuple; + typedef typename MatcherTuple<ArgumentTuple>::type ArgumentMatcherTuple; + typedef void MakeResultVoid(A1, A2); + typedef IgnoredValue MakeResultIgnoredValue(A1, A2); +}; + +template <typename R, typename A1, typename A2, typename A3> +struct Function<R(A1, A2, A3)> + : Function<R(A1, A2)> { + typedef A3 Argument3; + typedef ::testing::tuple<A1, A2, A3> ArgumentTuple; + typedef typename MatcherTuple<ArgumentTuple>::type ArgumentMatcherTuple; + typedef void MakeResultVoid(A1, A2, A3); + typedef IgnoredValue MakeResultIgnoredValue(A1, A2, A3); +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4> +struct Function<R(A1, A2, A3, A4)> + : Function<R(A1, A2, A3)> { + typedef A4 Argument4; + typedef ::testing::tuple<A1, A2, A3, A4> ArgumentTuple; + typedef typename MatcherTuple<ArgumentTuple>::type ArgumentMatcherTuple; + typedef void MakeResultVoid(A1, A2, A3, A4); + typedef IgnoredValue MakeResultIgnoredValue(A1, A2, A3, A4); +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5> +struct Function<R(A1, A2, A3, A4, A5)> + : Function<R(A1, A2, A3, A4)> { + typedef A5 Argument5; + typedef ::testing::tuple<A1, A2, A3, A4, A5> ArgumentTuple; + 
typedef typename MatcherTuple<ArgumentTuple>::type ArgumentMatcherTuple; + typedef void MakeResultVoid(A1, A2, A3, A4, A5); + typedef IgnoredValue MakeResultIgnoredValue(A1, A2, A3, A4, A5); +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6> +struct Function<R(A1, A2, A3, A4, A5, A6)> + : Function<R(A1, A2, A3, A4, A5)> { + typedef A6 Argument6; + typedef ::testing::tuple<A1, A2, A3, A4, A5, A6> ArgumentTuple; + typedef typename MatcherTuple<ArgumentTuple>::type ArgumentMatcherTuple; + typedef void MakeResultVoid(A1, A2, A3, A4, A5, A6); + typedef IgnoredValue MakeResultIgnoredValue(A1, A2, A3, A4, A5, A6); +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7> +struct Function<R(A1, A2, A3, A4, A5, A6, A7)> + : Function<R(A1, A2, A3, A4, A5, A6)> { + typedef A7 Argument7; + typedef ::testing::tuple<A1, A2, A3, A4, A5, A6, A7> ArgumentTuple; + typedef typename MatcherTuple<ArgumentTuple>::type ArgumentMatcherTuple; + typedef void MakeResultVoid(A1, A2, A3, A4, A5, A6, A7); + typedef IgnoredValue MakeResultIgnoredValue(A1, A2, A3, A4, A5, A6, A7); +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7, typename A8> +struct Function<R(A1, A2, A3, A4, A5, A6, A7, A8)> + : Function<R(A1, A2, A3, A4, A5, A6, A7)> { + typedef A8 Argument8; + typedef ::testing::tuple<A1, A2, A3, A4, A5, A6, A7, A8> ArgumentTuple; + typedef typename MatcherTuple<ArgumentTuple>::type ArgumentMatcherTuple; + typedef void MakeResultVoid(A1, A2, A3, A4, A5, A6, A7, A8); + typedef IgnoredValue MakeResultIgnoredValue(A1, A2, A3, A4, A5, A6, A7, A8); +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7, typename A8, typename A9> +struct Function<R(A1, A2, A3, A4, A5, A6, A7, A8, A9)> + : Function<R(A1, A2, A3, A4, A5, A6, A7, A8)> { + typedef A9 Argument9; + typedef ::testing::tuple<A1, A2, A3, A4, A5, A6, A7, A8, A9> ArgumentTuple; + typedef typename MatcherTuple<ArgumentTuple>::type ArgumentMatcherTuple; + typedef void MakeResultVoid(A1, A2, A3, A4, A5, A6, A7, A8, A9); + typedef IgnoredValue MakeResultIgnoredValue(A1, A2, A3, A4, A5, A6, A7, A8, + A9); +}; + +template <typename R, typename A1, typename A2, typename A3, typename A4, + typename A5, typename A6, typename A7, typename A8, typename A9, + typename A10> +struct Function<R(A1, A2, A3, A4, A5, A6, A7, A8, A9, A10)> + : Function<R(A1, A2, A3, A4, A5, A6, A7, A8, A9)> { + typedef A10 Argument10; + typedef ::testing::tuple<A1, A2, A3, A4, A5, A6, A7, A8, A9, + A10> ArgumentTuple; + typedef typename MatcherTuple<ArgumentTuple>::type ArgumentMatcherTuple; + typedef void MakeResultVoid(A1, A2, A3, A4, A5, A6, A7, A8, A9, A10); + typedef IgnoredValue MakeResultIgnoredValue(A1, A2, A3, A4, A5, A6, A7, A8, + A9, A10); +}; + +} // namespace internal + +} // namespace testing + +#endif // GMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_GENERATED_INTERNAL_UTILS_H_ diff --git a/utils/unittest/googlemock/include/gmock/internal/gmock-internal-utils.h b/utils/unittest/googlemock/include/gmock/internal/gmock-internal-utils.h new file mode 100644 index 000000000000..e2ddb05c91d7 --- /dev/null +++ b/utils/unittest/googlemock/include/gmock/internal/gmock-internal-utils.h @@ -0,0 +1,511 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
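To make the Function<F> trait machinery just defined concrete, here is a small compile-time sketch (illustrative only, not part of the imported files), using gtest's StaticAssertTypeEq:

#include "gmock/gmock.h"
#include "gtest/gtest.h"

typedef testing::internal::Function<int(bool, double)> F;

void FunctionTraitChecks() {
  // Result and ArgumentN are the typedefs documented above.
  testing::StaticAssertTypeEq<int, F::Result>();
  testing::StaticAssertTypeEq<bool, F::Argument1>();
  testing::StaticAssertTypeEq<double, F::Argument2>();
  // ArgumentTuple collects all parameter types...
  testing::StaticAssertTypeEq< ::testing::tuple<bool, double>,
                               F::ArgumentTuple>();
  // ...and ArgumentMatcherTuple wraps each one in a Matcher.
  testing::StaticAssertTypeEq<
      ::testing::tuple<testing::Matcher<bool>, testing::Matcher<double> >,
      F::ArgumentMatcherTuple>();
}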
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file defines some utilities useful for implementing Google +// Mock. They are subject to change without notice, so please DO NOT +// USE THEM IN USER CODE. + +#ifndef GMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_ +#define GMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_ + +#include <stdio.h> +#include <ostream> // NOLINT +#include <string> + +#include "gmock/internal/gmock-generated-internal-utils.h" +#include "gmock/internal/gmock-port.h" +#include "gtest/gtest.h" + +namespace testing { +namespace internal { + +// Converts an identifier name to a space-separated list of lower-case +// words. Each maximum substring of the form [A-Za-z][a-z]*|\d+ is +// treated as one word. For example, both "FooBar123" and +// "foo_bar_123" are converted to "foo bar 123". +GTEST_API_ string ConvertIdentifierNameToWords(const char* id_name); + +// PointeeOf<Pointer>::type is the type of a value pointed to by a +// Pointer, which can be either a smart pointer or a raw pointer. The +// following default implementation is for the case where Pointer is a +// smart pointer. +template <typename Pointer> +struct PointeeOf { + // Smart pointer classes define type element_type as the type of + // their pointees. + typedef typename Pointer::element_type type; +}; +// This specialization is for the raw pointer case. +template <typename T> +struct PointeeOf<T*> { typedef T type; }; // NOLINT + +// GetRawPointer(p) returns the raw pointer underlying p when p is a +// smart pointer, or returns p itself when p is already a raw pointer. +// The following default implementation is for the smart pointer case. +template <typename Pointer> +inline const typename Pointer::element_type* GetRawPointer(const Pointer& p) { + return p.get(); +} +// This overloaded version is for the raw pointer case. 
+template <typename Element> +inline Element* GetRawPointer(Element* p) { return p; } + +// This comparator allows linked_ptr to be stored in sets. +template <typename T> +struct LinkedPtrLessThan { + bool operator()(const ::testing::internal::linked_ptr<T>& lhs, + const ::testing::internal::linked_ptr<T>& rhs) const { + return lhs.get() < rhs.get(); + } +}; + +// Symbian compilation can be done with wchar_t being either a native +// type or a typedef. Using Google Mock with OpenC without wchar_t +// should require the definition of _STLP_NO_WCHAR_T. +// +// MSVC treats wchar_t as a native type usually, but treats it as the +// same as unsigned short when the compiler option /Zc:wchar_t- is +// specified. It defines _NATIVE_WCHAR_T_DEFINED symbol when wchar_t +// is a native type. +#if (GTEST_OS_SYMBIAN && defined(_STLP_NO_WCHAR_T)) || \ + (defined(_MSC_VER) && !defined(_NATIVE_WCHAR_T_DEFINED)) +// wchar_t is a typedef. +#else +# define GMOCK_WCHAR_T_IS_NATIVE_ 1 +#endif + +// signed wchar_t and unsigned wchar_t are NOT in the C++ standard. +// Using them is a bad practice and not portable. So DON'T use them. +// +// Still, Google Mock is designed to work even if the user uses signed +// wchar_t or unsigned wchar_t (obviously, assuming the compiler +// supports them). +// +// To gcc, +// wchar_t == signed wchar_t != unsigned wchar_t == unsigned int +#ifdef __GNUC__ +// signed/unsigned wchar_t are valid types. +# define GMOCK_HAS_SIGNED_WCHAR_T_ 1 +#endif + +// In what follows, we use the term "kind" to indicate whether a type +// is bool, an integer type (excluding bool), a floating-point type, +// or none of them. This categorization is useful for determining +// when a matcher argument type can be safely converted to another +// type in the implementation of SafeMatcherCast. +enum TypeKind { + kBool, kInteger, kFloatingPoint, kOther +}; + +// KindOf<T>::value is the kind of type T. +template <typename T> struct KindOf { + enum { value = kOther }; // The default kind. +}; + +// This macro declares that the kind of 'type' is 'kind'. +#define GMOCK_DECLARE_KIND_(type, kind) \ + template <> struct KindOf<type> { enum { value = kind }; } + +GMOCK_DECLARE_KIND_(bool, kBool); + +// All standard integer types. +GMOCK_DECLARE_KIND_(char, kInteger); +GMOCK_DECLARE_KIND_(signed char, kInteger); +GMOCK_DECLARE_KIND_(unsigned char, kInteger); +GMOCK_DECLARE_KIND_(short, kInteger); // NOLINT +GMOCK_DECLARE_KIND_(unsigned short, kInteger); // NOLINT +GMOCK_DECLARE_KIND_(int, kInteger); +GMOCK_DECLARE_KIND_(unsigned int, kInteger); +GMOCK_DECLARE_KIND_(long, kInteger); // NOLINT +GMOCK_DECLARE_KIND_(unsigned long, kInteger); // NOLINT + +#if GMOCK_WCHAR_T_IS_NATIVE_ +GMOCK_DECLARE_KIND_(wchar_t, kInteger); +#endif + +// Non-standard integer types. +GMOCK_DECLARE_KIND_(Int64, kInteger); +GMOCK_DECLARE_KIND_(UInt64, kInteger); + +// All standard floating-point types. +GMOCK_DECLARE_KIND_(float, kFloatingPoint); +GMOCK_DECLARE_KIND_(double, kFloatingPoint); +GMOCK_DECLARE_KIND_(long double, kFloatingPoint); + +#undef GMOCK_DECLARE_KIND_ + +// Evaluates to the kind of 'type'. +#define GMOCK_KIND_OF_(type) \ + static_cast< ::testing::internal::TypeKind>( \ + ::testing::internal::KindOf<type>::value) + +// Evaluates to true iff integer type T is signed. +#define GMOCK_IS_SIGNED_(T) (static_cast<T>(-1) < 0) + +// LosslessArithmeticConvertibleImpl<kFromKind, From, kToKind, To>::value +// is true iff arithmetic type From can be losslessly converted to +// arithmetic type To. 
+//
+// It's the user's responsibility to ensure that both From and To are
+// raw (i.e. have no CV modifiers, are not pointers, and are not
+// references) built-in arithmetic types, kFromKind is the kind of
+// From, and kToKind is the kind of To; the value is
+// implementation-defined when the above pre-condition is violated.
+template <TypeKind kFromKind, typename From, TypeKind kToKind, typename To>
+struct LosslessArithmeticConvertibleImpl : public false_type {};
+
+// Converting bool to bool is lossless.
+template <>
+struct LosslessArithmeticConvertibleImpl<kBool, bool, kBool, bool>
+    : public true_type {};  // NOLINT
+
+// Converting bool to any integer type is lossless.
+template <typename To>
+struct LosslessArithmeticConvertibleImpl<kBool, bool, kInteger, To>
+    : public true_type {};  // NOLINT
+
+// Converting bool to any floating-point type is lossless.
+template <typename To>
+struct LosslessArithmeticConvertibleImpl<kBool, bool, kFloatingPoint, To>
+    : public true_type {};  // NOLINT
+
+// Converting an integer to bool is lossy.
+template <typename From>
+struct LosslessArithmeticConvertibleImpl<kInteger, From, kBool, bool>
+    : public false_type {};  // NOLINT
+
+// Converting an integer to another non-bool integer is lossless iff
+// the target type's range encloses the source type's range.
+template <typename From, typename To>
+struct LosslessArithmeticConvertibleImpl<kInteger, From, kInteger, To>
+    : public bool_constant<
+      // When converting from a smaller size to a larger size, we are
+      // fine as long as we are not converting from signed to unsigned.
+      ((sizeof(From) < sizeof(To)) &&
+       (!GMOCK_IS_SIGNED_(From) || GMOCK_IS_SIGNED_(To))) ||
+      // When converting between the same size, the signedness must match.
+      ((sizeof(From) == sizeof(To)) &&
+       (GMOCK_IS_SIGNED_(From) == GMOCK_IS_SIGNED_(To)))> {};  // NOLINT
+
+#undef GMOCK_IS_SIGNED_
+
+// Converting an integer to a floating-point type may be lossy, since
+// the format of a floating-point number is implementation-defined.
+template <typename From, typename To>
+struct LosslessArithmeticConvertibleImpl<kInteger, From, kFloatingPoint, To>
+    : public false_type {};  // NOLINT
+
+// Converting a floating-point to bool is lossy.
+template <typename From>
+struct LosslessArithmeticConvertibleImpl<kFloatingPoint, From, kBool, bool>
+    : public false_type {};  // NOLINT
+
+// Converting a floating-point to an integer is lossy.
+template <typename From, typename To>
+struct LosslessArithmeticConvertibleImpl<kFloatingPoint, From, kInteger, To>
+    : public false_type {};  // NOLINT
+
+// Converting a floating-point to another floating-point is lossless
+// iff the target type is at least as big as the source type.
+template <typename From, typename To>
+struct LosslessArithmeticConvertibleImpl<
+  kFloatingPoint, From, kFloatingPoint, To>
+    : public bool_constant<sizeof(From) <= sizeof(To)> {};  // NOLINT
+
+// LosslessArithmeticConvertible<From, To>::value is true iff arithmetic
+// type From can be losslessly converted to arithmetic type To.
+//
+// It's the user's responsibility to ensure that both From and To are
+// raw (i.e. have no CV modifiers, are not pointers, and are not
+// references) built-in arithmetic types; the value is
+// implementation-defined when the above pre-condition is violated.
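A few concrete instantiations implied by the conversion rules above (illustrative only, not part of the imported header):

  LosslessArithmeticConvertible<bool, int>::value      // true:  bool -> integer
  LosslessArithmeticConvertible<int, bool>::value      // false: integer -> bool
  LosslessArithmeticConvertible<int, double>::value    // false: integer -> floating-point
  LosslessArithmeticConvertible<float, double>::value  // true:  sizeof(float) <= sizeof(double)
  LosslessArithmeticConvertible<double, float>::value  // false: the target is narrower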
+template <typename From, typename To> +struct LosslessArithmeticConvertible + : public LosslessArithmeticConvertibleImpl< + GMOCK_KIND_OF_(From), From, GMOCK_KIND_OF_(To), To> {}; // NOLINT + +// This interface knows how to report a Google Mock failure (either +// non-fatal or fatal). +class FailureReporterInterface { + public: + // The type of a failure (either non-fatal or fatal). + enum FailureType { + kNonfatal, kFatal + }; + + virtual ~FailureReporterInterface() {} + + // Reports a failure that occurred at the given source file location. + virtual void ReportFailure(FailureType type, const char* file, int line, + const string& message) = 0; +}; + +// Returns the failure reporter used by Google Mock. +GTEST_API_ FailureReporterInterface* GetFailureReporter(); + +// Asserts that condition is true; aborts the process with the given +// message if condition is false. We cannot use LOG(FATAL) or CHECK() +// as Google Mock might be used to mock the log sink itself. We +// inline this function to prevent it from showing up in the stack +// trace. +inline void Assert(bool condition, const char* file, int line, + const string& msg) { + if (!condition) { + GetFailureReporter()->ReportFailure(FailureReporterInterface::kFatal, + file, line, msg); + } +} +inline void Assert(bool condition, const char* file, int line) { + Assert(condition, file, line, "Assertion failed."); +} + +// Verifies that condition is true; generates a non-fatal failure if +// condition is false. +inline void Expect(bool condition, const char* file, int line, + const string& msg) { + if (!condition) { + GetFailureReporter()->ReportFailure(FailureReporterInterface::kNonfatal, + file, line, msg); + } +} +inline void Expect(bool condition, const char* file, int line) { + Expect(condition, file, line, "Expectation failed."); +} + +// Severity level of a log. +enum LogSeverity { + kInfo = 0, + kWarning = 1 +}; + +// Valid values for the --gmock_verbose flag. + +// All logs (informational and warnings) are printed. +const char kInfoVerbosity[] = "info"; +// Only warnings are printed. +const char kWarningVerbosity[] = "warning"; +// No logs are printed. +const char kErrorVerbosity[] = "error"; + +// Returns true iff a log with the given severity is visible according +// to the --gmock_verbose flag. +GTEST_API_ bool LogIsVisible(LogSeverity severity); + +// Prints the given message to stdout iff 'severity' >= the level +// specified by the --gmock_verbose flag. If stack_frames_to_skip >= +// 0, also prints the stack trace excluding the top +// stack_frames_to_skip frames. In opt mode, any positive +// stack_frames_to_skip is treated as 0, since we don't know which +// function calls will be inlined by the compiler and need to be +// conservative. +GTEST_API_ void Log(LogSeverity severity, + const string& message, + int stack_frames_to_skip); + +// TODO(wan@google.com): group all type utilities together. + +// Type traits. + +// is_reference<T>::value is non-zero iff T is a reference type. +template <typename T> struct is_reference : public false_type {}; +template <typename T> struct is_reference<T&> : public true_type {}; + +// type_equals<T1, T2>::value is non-zero iff T1 and T2 are the same type. +template <typename T1, typename T2> struct type_equals : public false_type {}; +template <typename T> struct type_equals<T, T> : public true_type {}; + +// remove_reference<T>::type removes the reference from type T, if any. 
+template <typename T> struct remove_reference { typedef T type; };  // NOLINT
+template <typename T> struct remove_reference<T&> { typedef T type; };  // NOLINT
+
+// DecayArray<T>::type turns an array type U[N] to const U* and preserves
+// other types. Useful for saving a copy of a function argument.
+template <typename T> struct DecayArray { typedef T type; };  // NOLINT
+template <typename T, size_t N> struct DecayArray<T[N]> {
+  typedef const T* type;
+};
+// Sometimes people use arrays whose size is not available at the use site
+// (e.g. extern const char kNamePrefix[]). This specialization covers that
+// case.
+template <typename T> struct DecayArray<T[]> {
+  typedef const T* type;
+};
+
+// Disable MSVC warnings for infinite recursion, since in this case
+// the recursion is unreachable.
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable:4717)
+#endif
+
+// Invalid<T>() is usable as an expression of type T, but will terminate
+// the program with an assertion failure if actually run. This is useful
+// when a value of type T is needed for compilation, but the statement
+// will not really be executed (or we don't care if the statement
+// crashes).
+template <typename T>
+inline T Invalid() {
+  Assert(false, "", -1, "Internal error: attempt to return invalid value");
+  // This statement is unreachable, and would never terminate even if it
+  // could be reached. It is provided only to placate compiler warnings
+  // about missing return statements.
+  return Invalid<T>();
+}
+
+#ifdef _MSC_VER
+# pragma warning(pop)
+#endif
+
+// Given a raw type (i.e. having no top-level reference or const
+// modifier) RawContainer that's either an STL-style container or a
+// native array, class StlContainerView<RawContainer> has the
+// following members:
+//
+//   - type is a type that provides an STL-style container view to
+//     (i.e. implements the STL container concept for) RawContainer;
+//   - const_reference is a type that provides a reference to a const
+//     RawContainer;
+//   - ConstReference(raw_container) returns a const reference to an STL-style
+//     container view to raw_container, which is a RawContainer.
+//   - Copy(raw_container) returns an STL-style container view of a
+//     copy of raw_container, which is a RawContainer.
+//
+// This generic version is used when RawContainer itself is already an
+// STL-style container.
+template <class RawContainer>
+class StlContainerView {
+ public:
+  typedef RawContainer type;
+  typedef const type& const_reference;
+
+  static const_reference ConstReference(const RawContainer& container) {
+    // Ensures that RawContainer is not a const type.
+    testing::StaticAssertTypeEq<RawContainer,
+        GTEST_REMOVE_CONST_(RawContainer)>();
+    return container;
+  }
+  static type Copy(const RawContainer& container) { return container; }
+};
+
+// This specialization is used when RawContainer is a native array type.
+template <typename Element, size_t N>
+class StlContainerView<Element[N]> {
+ public:
+  typedef GTEST_REMOVE_CONST_(Element) RawElement;
+  typedef internal::NativeArray<RawElement> type;
+  // NativeArray<T> can represent a native array either by value or by
+  // reference (selected by a constructor argument), so 'const type'
+  // can be used to reference a const native array. We cannot
+  // 'typedef const type& const_reference' here, as that would mean
+  // ConstReference() has to return a reference to a local variable.
+  typedef const type const_reference;
+
+  static const_reference ConstReference(const Element (&array)[N]) {
+    // Ensures that Element is not a const type.
+    testing::StaticAssertTypeEq<Element, RawElement>();
+#if GTEST_OS_SYMBIAN
+    // The Nokia Symbian compiler confuses itself in template instantiation
+    // for this call without the cast to Element*:
+    //   function call '[testing::internal::NativeArray<char *>].NativeArray(
+    //       {lval} const char *[4], long, testing::internal::RelationToSource)'
+    //   does not match
+    //   'testing::internal::NativeArray<char *>::NativeArray(
+    //       char *const *, unsigned int, testing::internal::RelationToSource)'
+    //   (instantiating: 'testing::internal::ContainsMatcherImpl
+    //       <const char * (&)[4]>::Matches(const char * (&)[4]) const')
+    //   (instantiating: 'testing::internal::StlContainerView<char *[4]>::
+    //       ConstReference(const char * (&)[4])')
+    // (and though the N parameter type is mismatched in the above, an
+    // explicit conversion of it doesn't help - only the conversion of the
+    // array does).
+    return type(const_cast<Element*>(&array[0]), N,
+                RelationToSourceReference());
+#else
+    return type(array, N, RelationToSourceReference());
+#endif  // GTEST_OS_SYMBIAN
+  }
+  static type Copy(const Element (&array)[N]) {
+#if GTEST_OS_SYMBIAN
+    return type(const_cast<Element*>(&array[0]), N, RelationToSourceCopy());
+#else
+    return type(array, N, RelationToSourceCopy());
+#endif  // GTEST_OS_SYMBIAN
+  }
+};
+
+// This specialization is used when RawContainer is a native array
+// represented as a (pointer, size) tuple.
+template <typename ElementPointer, typename Size>
+class StlContainerView< ::testing::tuple<ElementPointer, Size> > {
+ public:
+  typedef GTEST_REMOVE_CONST_(
+      typename internal::PointeeOf<ElementPointer>::type) RawElement;
+  typedef internal::NativeArray<RawElement> type;
+  typedef const type const_reference;
+
+  static const_reference ConstReference(
+      const ::testing::tuple<ElementPointer, Size>& array) {
+    return type(get<0>(array), get<1>(array), RelationToSourceReference());
+  }
+  static type Copy(const ::testing::tuple<ElementPointer, Size>& array) {
+    return type(get<0>(array), get<1>(array), RelationToSourceCopy());
+  }
+};
+
+// The following specialization prevents the user from instantiating
+// StlContainer with a reference type.
+template <typename T> class StlContainerView<T&>;
+
+// A type transform to remove constness from the first part of a pair.
+// Pairs like that are used as the value_type of associative containers,
+// and this transform produces a similar but assignable pair.
+template <typename T>
+struct RemoveConstFromKey {
+  typedef T type;
+};
+
+// Partially specialized to remove constness from std::pair<const K, V>.
+template <typename K, typename V>
+struct RemoveConstFromKey<std::pair<const K, V> > {
+  typedef std::pair<K, V> type;
+};
+
+// Mapping from booleans to types. Similar to boost::bool_<kValue> and
+// std::integral_constant<bool, kValue>.
+template <bool kValue>
+struct BooleanConstant {};
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_
+
diff --git a/utils/unittest/googlemock/include/gmock/internal/gmock-port.h b/utils/unittest/googlemock/include/gmock/internal/gmock-port.h
new file mode 100644
index 000000000000..63f4a6802e8e
--- /dev/null
+++ b/utils/unittest/googlemock/include/gmock/internal/gmock-port.h
@@ -0,0 +1,91 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
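The StlContainerView machinery above is easiest to see with a native array (a sketch against the same API, not part of the imported files):

#include "gmock/internal/gmock-internal-utils.h"

void StlContainerViewDemo() {
  typedef testing::internal::StlContainerView<int[3]> View;
  int a[3] = {1, 2, 3};
  // ConstReference() wraps the existing storage without copying; note
  // that const_reference is a const NativeArray<int> by value, per the
  // comment above.
  View::const_reference v = View::ConstReference(a);
  // Copy() returns a NativeArray<int> that owns a copy of the elements.
  View::type c = View::Copy(a);
  (void)v;
  (void)c;
}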
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: vadimb@google.com (Vadim Berman) +// +// Low-level types and utilities for porting Google Mock to various +// platforms. All macros ending with _ and symbols defined in an +// internal namespace are subject to change without notice. Code +// outside Google Mock MUST NOT USE THEM DIRECTLY. Macros that don't +// end with _ are part of Google Mock's public API and can be used by +// code outside Google Mock. + +#ifndef GMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_ +#define GMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_ + +#include <assert.h> +#include <stdlib.h> +#include <iostream> + +// Most of the utilities needed for porting Google Mock are also +// required for Google Test and are defined in gtest-port.h. +// +// Note to maintainers: to reduce code duplication, prefer adding +// portability utilities to Google Test's gtest-port.h instead of +// here, as Google Mock depends on Google Test. Only add a utility +// here if it's truly specific to Google Mock. +#include "gtest/internal/gtest-linked_ptr.h" +#include "gtest/internal/gtest-port.h" +#include "gmock/internal/custom/gmock-port.h" + +// To avoid conditional compilation everywhere, we make it +// gmock-port.h's responsibility to #include the header implementing +// tr1/tuple. gmock-port.h does this via gtest-port.h, which is +// guaranteed to pull in the tuple header. + +// For MS Visual C++, check the compiler version. At least VS 2003 is +// required to compile Google Mock. +#if defined(_MSC_VER) && _MSC_VER < 1310 +# error "At least Visual C++ 2003 (7.1) is required to compile Google Mock." +#endif + +// Macro for referencing flags. This is public as we want the user to +// use this syntax to reference Google Mock flags. +#define GMOCK_FLAG(name) FLAGS_gmock_##name + +#if !defined(GMOCK_DECLARE_bool_) + +// Macros for declaring flags. 
+#define GMOCK_DECLARE_bool_(name) extern GTEST_API_ bool GMOCK_FLAG(name) +#define GMOCK_DECLARE_int32_(name) \ + extern GTEST_API_ ::testing::internal::Int32 GMOCK_FLAG(name) +#define GMOCK_DECLARE_string_(name) \ + extern GTEST_API_ ::std::string GMOCK_FLAG(name) + +// Macros for defining flags. +#define GMOCK_DEFINE_bool_(name, default_val, doc) \ + GTEST_API_ bool GMOCK_FLAG(name) = (default_val) +#define GMOCK_DEFINE_int32_(name, default_val, doc) \ + GTEST_API_ ::testing::internal::Int32 GMOCK_FLAG(name) = (default_val) +#define GMOCK_DEFINE_string_(name, default_val, doc) \ + GTEST_API_ ::std::string GMOCK_FLAG(name) = (default_val) + +#endif // !defined(GMOCK_DECLARE_bool_) + +#endif // GMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_ diff --git a/utils/unittest/googlemock/src/gmock-all.cc b/utils/unittest/googlemock/src/gmock-all.cc new file mode 100644 index 000000000000..7aebce7afefc --- /dev/null +++ b/utils/unittest/googlemock/src/gmock-all.cc @@ -0,0 +1,47 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// Google C++ Mocking Framework (Google Mock) +// +// This file #includes all Google Mock implementation .cc files. The +// purpose is to allow a user to build Google Mock by compiling this +// file alone. + +// This line ensures that gmock.h can be compiled on its own, even +// when it's fused. +#include "gmock/gmock.h" + +// The following lines pull in the real gmock *.cc files. +#include "src/gmock-cardinalities.cc" +#include "src/gmock-internal-utils.cc" +#include "src/gmock-matchers.cc" +#include "src/gmock-spec-builders.cc" +#include "src/gmock.cc" diff --git a/utils/unittest/googlemock/src/gmock-cardinalities.cc b/utils/unittest/googlemock/src/gmock-cardinalities.cc new file mode 100644 index 000000000000..50ec7286eebe --- /dev/null +++ b/utils/unittest/googlemock/src/gmock-cardinalities.cc @@ -0,0 +1,156 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
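Because the fallback definitions above are wrapped in #if !defined(GMOCK_DECLARE_bool_), the custom gmock/internal/custom/gmock-port.h injection point added earlier in this patch can supply its own flag plumbing first; user code only ever refers to the flags through GMOCK_FLAG. A sketch, where MYFLAGS_* stands for a hypothetical project flag library:

// In a project-local gmock/internal/custom/gmock-port.h (hypothetical):
//   #define GMOCK_DECLARE_string_(name) MYFLAGS_DECLARE_STRING(gmock_##name)
//   #define GMOCK_DEFINE_string_(name, default_val, doc) \
//       MYFLAGS_DEFINE_STRING(gmock_##name, default_val, doc)

#include "gmock/gmock.h"

bool VerboseLoggingRequested() {
  // GMOCK_FLAG(verbose) expands to the variable FLAGS_gmock_verbose,
  // which the macros above declare in namespace testing.
  return testing::GMOCK_FLAG(verbose) == "info";
}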
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file implements cardinalities. + +#include "gmock/gmock-cardinalities.h" + +#include <limits.h> +#include <ostream> // NOLINT +#include <sstream> +#include <string> +#include "gmock/internal/gmock-internal-utils.h" +#include "gtest/gtest.h" + +namespace testing { + +namespace { + +// Implements the Between(m, n) cardinality. +class BetweenCardinalityImpl : public CardinalityInterface { + public: + BetweenCardinalityImpl(int min, int max) + : min_(min >= 0 ? min : 0), + max_(max >= min_ ? max : min_) { + std::stringstream ss; + if (min < 0) { + ss << "The invocation lower bound must be >= 0, " + << "but is actually " << min << "."; + internal::Expect(false, __FILE__, __LINE__, ss.str()); + } else if (max < 0) { + ss << "The invocation upper bound must be >= 0, " + << "but is actually " << max << "."; + internal::Expect(false, __FILE__, __LINE__, ss.str()); + } else if (min > max) { + ss << "The invocation upper bound (" << max + << ") must be >= the invocation lower bound (" << min + << ")."; + internal::Expect(false, __FILE__, __LINE__, ss.str()); + } + } + + // Conservative estimate on the lower/upper bound of the number of + // calls allowed. + virtual int ConservativeLowerBound() const { return min_; } + virtual int ConservativeUpperBound() const { return max_; } + + virtual bool IsSatisfiedByCallCount(int call_count) const { + return min_ <= call_count && call_count <= max_; + } + + virtual bool IsSaturatedByCallCount(int call_count) const { + return call_count >= max_; + } + + virtual void DescribeTo(::std::ostream* os) const; + + private: + const int min_; + const int max_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(BetweenCardinalityImpl); +}; + +// Formats "n times" in a human-friendly way. 
+inline internal::string FormatTimes(int n) { + if (n == 1) { + return "once"; + } else if (n == 2) { + return "twice"; + } else { + std::stringstream ss; + ss << n << " times"; + return ss.str(); + } +} + +// Describes the Between(m, n) cardinality in human-friendly text. +void BetweenCardinalityImpl::DescribeTo(::std::ostream* os) const { + if (min_ == 0) { + if (max_ == 0) { + *os << "never called"; + } else if (max_ == INT_MAX) { + *os << "called any number of times"; + } else { + *os << "called at most " << FormatTimes(max_); + } + } else if (min_ == max_) { + *os << "called " << FormatTimes(min_); + } else if (max_ == INT_MAX) { + *os << "called at least " << FormatTimes(min_); + } else { + // 0 < min_ < max_ < INT_MAX + *os << "called between " << min_ << " and " << max_ << " times"; + } +} + +} // Unnamed namespace + +// Describes the given call count to an ostream. +void Cardinality::DescribeActualCallCountTo(int actual_call_count, + ::std::ostream* os) { + if (actual_call_count > 0) { + *os << "called " << FormatTimes(actual_call_count); + } else { + *os << "never called"; + } +} + +// Creates a cardinality that allows at least n calls. +GTEST_API_ Cardinality AtLeast(int n) { return Between(n, INT_MAX); } + +// Creates a cardinality that allows at most n calls. +GTEST_API_ Cardinality AtMost(int n) { return Between(0, n); } + +// Creates a cardinality that allows any number of calls. +GTEST_API_ Cardinality AnyNumber() { return AtLeast(0); } + +// Creates a cardinality that allows between min and max calls. +GTEST_API_ Cardinality Between(int min, int max) { + return Cardinality(new BetweenCardinalityImpl(min, max)); +} + +// Creates a cardinality that allows exactly n calls. +GTEST_API_ Cardinality Exactly(int n) { return Between(n, n); } + +} // namespace testing diff --git a/utils/unittest/googlemock/src/gmock-internal-utils.cc b/utils/unittest/googlemock/src/gmock-internal-utils.cc new file mode 100644 index 000000000000..fb5308018a71 --- /dev/null +++ b/utils/unittest/googlemock/src/gmock-internal-utils.cc @@ -0,0 +1,174 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
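In user code these factory functions are consumed through EXPECT_CALL(...).Times(...), and the DescribeTo() strings above are what show up in failure messages. A sketch reusing the hypothetical MockTurtle from the earlier example:

using ::testing::AtMost;
using ::testing::Between;

TEST(TurtleTest, CardinalityExamples) {
  MockTurtle turtle;
  // Described as "called between 1 and 3 times" on failure.
  EXPECT_CALL(turtle, Forward(10)).Times(Between(1, 3));
  // Described as "called at most twice".
  EXPECT_CALL(turtle, GetX()).Times(AtMost(2));
  turtle.Forward(10);
}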
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file defines some utilities useful for implementing Google +// Mock. They are subject to change without notice, so please DO NOT +// USE THEM IN USER CODE. + +#include "gmock/internal/gmock-internal-utils.h" + +#include <ctype.h> +#include <ostream> // NOLINT +#include <string> +#include "gmock/gmock.h" +#include "gmock/internal/gmock-port.h" +#include "gtest/gtest.h" + +namespace testing { +namespace internal { + +// Converts an identifier name to a space-separated list of lower-case +// words. Each maximum substring of the form [A-Za-z][a-z]*|\d+ is +// treated as one word. For example, both "FooBar123" and +// "foo_bar_123" are converted to "foo bar 123". +GTEST_API_ string ConvertIdentifierNameToWords(const char* id_name) { + string result; + char prev_char = '\0'; + for (const char* p = id_name; *p != '\0'; prev_char = *(p++)) { + // We don't care about the current locale as the input is + // guaranteed to be a valid C++ identifier name. + const bool starts_new_word = IsUpper(*p) || + (!IsAlpha(prev_char) && IsLower(*p)) || + (!IsDigit(prev_char) && IsDigit(*p)); + + if (IsAlNum(*p)) { + if (starts_new_word && result != "") + result += ' '; + result += ToLower(*p); + } + } + return result; +} + +// This class reports Google Mock failures as Google Test failures. A +// user can define another class in a similar fashion if he intends to +// use Google Mock with a testing framework other than Google Test. +class GoogleTestFailureReporter : public FailureReporterInterface { + public: + virtual void ReportFailure(FailureType type, const char* file, int line, + const string& message) { + AssertHelper(type == kFatal ? + TestPartResult::kFatalFailure : + TestPartResult::kNonFatalFailure, + file, + line, + message.c_str()) = Message(); + if (type == kFatal) { + posix::Abort(); + } + } +}; + +// Returns the global failure reporter. Will create a +// GoogleTestFailureReporter and return it the first time called. +GTEST_API_ FailureReporterInterface* GetFailureReporter() { + // Points to the global failure reporter used by Google Mock. gcc + // guarantees that the following use of failure_reporter is + // thread-safe. We may need to add additional synchronization to + // protect failure_reporter if we port Google Mock to other + // compilers. + static FailureReporterInterface* const failure_reporter = + new GoogleTestFailureReporter(); + return failure_reporter; +} + +// Protects global resources (stdout in particular) used by Log(). +static GTEST_DEFINE_STATIC_MUTEX_(g_log_mutex); + +// Returns true iff a log with the given severity is visible according +// to the --gmock_verbose flag. +GTEST_API_ bool LogIsVisible(LogSeverity severity) { + if (GMOCK_FLAG(verbose) == kInfoVerbosity) { + // Always show the log if --gmock_verbose=info. 
+    return true;
+  } else if (GMOCK_FLAG(verbose) == kErrorVerbosity) {
+    // Always hide it if --gmock_verbose=error.
+    return false;
+  } else {
+    // If --gmock_verbose is neither "info" nor "error", we treat it
+    // as "warning" (its default value).
+    return severity == kWarning;
+  }
+}
+
+// Prints the given message to stdout iff 'severity' >= the level
+// specified by the --gmock_verbose flag. If stack_frames_to_skip >=
+// 0, also prints the stack trace excluding the top
+// stack_frames_to_skip frames. In opt mode, any positive
+// stack_frames_to_skip is treated as 0, since we don't know which
+// function calls will be inlined by the compiler and need to be
+// conservative.
+GTEST_API_ void Log(LogSeverity severity,
+                    const string& message,
+                    int stack_frames_to_skip) {
+  if (!LogIsVisible(severity))
+    return;
+
+  // Ensures that logs from different threads don't interleave.
+  MutexLock l(&g_log_mutex);
+
+  // "using ::std::cout;" doesn't work with Symbian's STLport, where cout is a
+  // macro.
+
+  if (severity == kWarning) {
+    // Prints a GMOCK WARNING marker to make the warnings easily searchable.
+    std::cout << "\nGMOCK WARNING:";
+  }
+  // Prepends a new-line to message if it doesn't start with one.
+  if (message.empty() || message[0] != '\n') {
+    std::cout << "\n";
+  }
+  std::cout << message;
+  if (stack_frames_to_skip >= 0) {
+#ifdef NDEBUG
+    // In opt mode, we have to be conservative and skip no stack frame.
+    const int actual_to_skip = 0;
+#else
+    // In dbg mode, we can do what the caller tells us to do (plus one
+    // for skipping this function's stack frame).
+    const int actual_to_skip = stack_frames_to_skip + 1;
+#endif  // NDEBUG
+
+    // Appends a new-line to message if it doesn't end with one.
+    if (!message.empty() && *message.rbegin() != '\n') {
+      std::cout << "\n";
+    }
+    std::cout << "Stack trace:\n"
+              << ::testing::internal::GetCurrentOsStackTraceExceptTop(
+                     ::testing::UnitTest::GetInstance(), actual_to_skip);
+  }
+  std::cout << ::std::flush;
+}
+
+}  // namespace internal
+}  // namespace testing
diff --git a/utils/unittest/googlemock/src/gmock-matchers.cc b/utils/unittest/googlemock/src/gmock-matchers.cc
new file mode 100644
index 000000000000..e7424510fca2
--- /dev/null
+++ b/utils/unittest/googlemock/src/gmock-matchers.cc
@@ -0,0 +1,498 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
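A short sketch of the utilities defined above (illustrative only; the expected strings follow from the comments in the implementation):

#include "gmock/internal/gmock-internal-utils.h"

void InternalUtilExamples() {
  using testing::internal::ConvertIdentifierNameToWords;
  // Each maximal [A-Za-z][a-z]*|\d+ substring becomes one lower-case
  // word, so both calls yield "foo bar 123".
  ConvertIdentifierNameToWords("FooBar123");
  ConvertIdentifierNameToWords("foo_bar_123");
  // Printed only when --gmock_verbose makes kWarning visible; the
  // negative stack_frames_to_skip suppresses the stack trace.
  testing::internal::Log(testing::internal::kWarning,
                         "something looks odd", -1);
}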
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file implements Matcher<const string&>, Matcher<string>, and +// utilities for defining matchers. + +#include "gmock/gmock-matchers.h" +#include "gmock/gmock-generated-matchers.h" + +#include <string.h> +#include <sstream> +#include <string> + +namespace testing { + +// Constructs a matcher that matches a const string& whose value is +// equal to s. +Matcher<const internal::string&>::Matcher(const internal::string& s) { + *this = Eq(s); +} + +// Constructs a matcher that matches a const string& whose value is +// equal to s. +Matcher<const internal::string&>::Matcher(const char* s) { + *this = Eq(internal::string(s)); +} + +// Constructs a matcher that matches a string whose value is equal to s. +Matcher<internal::string>::Matcher(const internal::string& s) { *this = Eq(s); } + +// Constructs a matcher that matches a string whose value is equal to s. +Matcher<internal::string>::Matcher(const char* s) { + *this = Eq(internal::string(s)); +} + +#if GTEST_HAS_STRING_PIECE_ +// Constructs a matcher that matches a const StringPiece& whose value is +// equal to s. +Matcher<const StringPiece&>::Matcher(const internal::string& s) { + *this = Eq(s); +} + +// Constructs a matcher that matches a const StringPiece& whose value is +// equal to s. +Matcher<const StringPiece&>::Matcher(const char* s) { + *this = Eq(internal::string(s)); +} + +// Constructs a matcher that matches a const StringPiece& whose value is +// equal to s. +Matcher<const StringPiece&>::Matcher(StringPiece s) { + *this = Eq(s.ToString()); +} + +// Constructs a matcher that matches a StringPiece whose value is equal to s. +Matcher<StringPiece>::Matcher(const internal::string& s) { + *this = Eq(s); +} + +// Constructs a matcher that matches a StringPiece whose value is equal to s. +Matcher<StringPiece>::Matcher(const char* s) { + *this = Eq(internal::string(s)); +} + +// Constructs a matcher that matches a StringPiece whose value is equal to s. +Matcher<StringPiece>::Matcher(StringPiece s) { + *this = Eq(s.ToString()); +} +#endif // GTEST_HAS_STRING_PIECE_ + +namespace internal { + +// Joins a vector of strings as if they are fields of a tuple; returns +// the joined string. +GTEST_API_ string JoinAsTuple(const Strings& fields) { + switch (fields.size()) { + case 0: + return ""; + case 1: + return fields[0]; + default: + string result = "(" + fields[0]; + for (size_t i = 1; i < fields.size(); i++) { + result += ", "; + result += fields[i]; + } + result += ")"; + return result; + } +} + +// Returns the description for a matcher defined using the MATCHER*() +// macro where the user-supplied description string is "", if +// 'negation' is false; otherwise returns the description of the +// negation of the matcher. 'param_values' contains a list of strings +// that are the print-out of the matcher's parameters. 
+GTEST_API_ string FormatMatcherDescription(bool negation, + const char* matcher_name, + const Strings& param_values) { + string result = ConvertIdentifierNameToWords(matcher_name); + if (param_values.size() >= 1) + result += " " + JoinAsTuple(param_values); + return negation ? "not (" + result + ")" : result; +} + +// FindMaxBipartiteMatching and its helper class. +// +// Uses the well-known Ford-Fulkerson max flow method to find a maximum +// bipartite matching. Flow is considered to be from left to right. +// There is an implicit source node that is connected to all of the left +// nodes, and an implicit sink node that is connected to all of the +// right nodes. All edges have unit capacity. +// +// Neither the flow graph nor the residual flow graph are represented +// explicitly. Instead, they are implied by the information in 'graph' and +// a vector<int> called 'left_' whose elements are initialized to the +// value kUnused. This represents the initial state of the algorithm, +// where the flow graph is empty, and the residual flow graph has the +// following edges: +// - An edge from source to each left_ node +// - An edge from each right_ node to sink +// - An edge from each left_ node to each right_ node, if the +// corresponding edge exists in 'graph'. +// +// When the TryAugment() method adds a flow, it sets left_[l] = r for some +// nodes l and r. This induces the following changes: +// - The edges (source, l), (l, r), and (r, sink) are added to the +// flow graph. +// - The same three edges are removed from the residual flow graph. +// - The reverse edges (l, source), (r, l), and (sink, r) are added +// to the residual flow graph, which is a directional graph +// representing unused flow capacity. +// +// When the method augments a flow (moving left_[l] from some r1 to some +// other r2), this can be thought of as "undoing" the above steps with +// respect to r1 and "redoing" them with respect to r2. +// +// It bears repeating that the flow graph and residual flow graph are +// never represented explicitly, but can be derived by looking at the +// information in 'graph' and in left_. +// +// As an optimization, there is a second vector<int> called right_ which +// does not provide any new information. Instead, it enables more +// efficient queries about edges entering or leaving the right-side nodes +// of the flow or residual flow graphs. The following invariants are +// maintained: +// +// left[l] == kUnused or right[left[l]] == l +// right[r] == kUnused or left[right[r]] == r +// +// . [ source ] . +// . ||| . +// . ||| . +// . ||\--> left[0]=1 ---\ right[0]=-1 ----\ . +// . || | | . +// . |\---> left[1]=-1 \--> right[1]=0 ---\| . +// . | || . +// . \----> left[2]=2 ------> right[2]=2 --\|| . +// . ||| . +// . elements matchers vvv . +// . [ sink ] . +// +// See Also: +// [1] Cormen, et al (2001). "Section 26.2: The Ford-Fulkerson method". +// "Introduction to Algorithms (Second ed.)", pp. 651-664. +// [2] "Ford-Fulkerson algorithm", Wikipedia, +// 'http://en.wikipedia.org/wiki/Ford%E2%80%93Fulkerson_algorithm' +class MaxBipartiteMatchState { + public: + explicit MaxBipartiteMatchState(const MatchMatrix& graph) + : graph_(&graph), + left_(graph_->LhsSize(), kUnused), + right_(graph_->RhsSize(), kUnused) { + } + + // Returns the edges of a maximal match, each in the form {left, right}. + ElementMatcherPairs Compute() { + // 'seen' is used for path finding { 0: unseen, 1: seen }. 
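
The long comment above describes the algorithm in flow-network terms; the class body that continues below implements it. As a cross-check, here is a self-contained sketch of the same augmenting-path search on a plain boolean adjacency matrix (all names are ours, not gmock's):

    #include <cstddef>
    #include <vector>

    const std::size_t kNone = static_cast<std::size_t>(-1);

    // DFS for an augmenting path from left node l; 'seen' marks right nodes
    // already visited on this attempt, exactly like TryAugment() below.
    bool Augment(std::size_t l, const std::vector<std::vector<bool> >& edge,
                 std::vector<std::size_t>& left, std::vector<std::size_t>& right,
                 std::vector<char>& seen) {
      for (std::size_t r = 0; r < right.size(); ++r) {
        if (seen[r] || !edge[l][r]) continue;
        seen[r] = 1;
        // Either r is free, or its current partner can be re-routed elsewhere.
        if (right[r] == kNone || Augment(right[r], edge, left, right, seen)) {
          left[l] = r;
          right[r] = l;
          return true;
        }
      }
      return false;
    }

    // Returns the size of a maximum matching; mirrors Compute() below.
    std::size_t MaxMatching(const std::vector<std::vector<bool> >& edge,
                            std::size_t rhs_size) {
      std::vector<std::size_t> left(edge.size(), kNone), right(rhs_size, kNone);
      std::size_t flow = 0;
      for (std::size_t l = 0; l < edge.size(); ++l) {
        std::vector<char> seen(rhs_size, 0);  // fresh marks per left node
        if (Augment(l, edge, left, right, seen)) ++flow;
      }
      return flow;
    }

The only subtlety relative to textbook Ford-Fulkerson is the one the comment below spells out: each left node needs to be tried just once, because a later augmentation never removes flow from a source-to-left edge.
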
+    ::std::vector<char> seen;
+    // Searches the residual flow graph for a path from each left node to
+    // the sink in the residual flow graph, and if one is found, adds flow
+    // to the graph. It's okay to search through the left nodes once. The
+    // edge from the implicit source node to each previously-visited left
+    // node will have flow if that left node has any path to the sink
+    // whatsoever. Subsequent augmentations can only add flow to the
+    // network, and cannot take away that previous flow unit from the source.
+    // Since the source-to-left edge can only carry one flow unit (or,
+    // each element can be matched to only one matcher), there is no need
+    // to visit the left nodes more than once looking for augmented paths.
+    // The flow is known to be possible or impossible by looking at the
+    // node once.
+    for (size_t ilhs = 0; ilhs < graph_->LhsSize(); ++ilhs) {
+      // Reset the path-marking vector and try to find a path from
+      // source to sink starting at the left_[ilhs] node.
+      GTEST_CHECK_(left_[ilhs] == kUnused)
+          << "ilhs: " << ilhs << ", left_[ilhs]: " << left_[ilhs];
+      // 'seen' initialized to 'graph_->RhsSize()' copies of 0.
+      seen.assign(graph_->RhsSize(), 0);
+      TryAugment(ilhs, &seen);
+    }
+    ElementMatcherPairs result;
+    for (size_t ilhs = 0; ilhs < left_.size(); ++ilhs) {
+      size_t irhs = left_[ilhs];
+      if (irhs == kUnused) continue;
+      result.push_back(ElementMatcherPair(ilhs, irhs));
+    }
+    return result;
+  }
+
+ private:
+  static const size_t kUnused = static_cast<size_t>(-1);
+
+  // Perform a depth-first search from left node ilhs to the sink. If a
+  // path is found, flow is added to the network by linking the left and
+  // right vector elements corresponding to each segment of the path.
+  // Returns true if a path to sink was found, which means that a unit of
+  // flow was added to the network. The 'seen' vector elements correspond
+  // to right nodes and are marked to eliminate cycles from the search.
+  //
+  // Left nodes will only be explored at most once because they
+  // are accessible from at most one right node in the residual flow
+  // graph.
+  //
+  // Note that left_[ilhs] is the only element of left_ that TryAugment will
+  // potentially transition from kUnused to another value. Any other
+  // left_ element holding kUnused before TryAugment will be holding it
+  // when TryAugment returns.
+  //
+  bool TryAugment(size_t ilhs, ::std::vector<char>* seen) {
+    for (size_t irhs = 0; irhs < graph_->RhsSize(); ++irhs) {
+      if ((*seen)[irhs])
+        continue;
+      if (!graph_->HasEdge(ilhs, irhs))
+        continue;
+      // There's an available edge from ilhs to irhs.
+      (*seen)[irhs] = 1;
+      // Next a search is performed to determine whether
+      // this edge is a dead end or leads to the sink.
+      //
+      // right_[irhs] == kUnused means that there is residual flow from
+      // right node irhs to the sink, so we can use that to finish this
+      // flow path and return success.
+      //
+      // Otherwise there is residual flow to some ilhs. We push flow
+      // along that path and call ourselves recursively to see if this
+      // ultimately leads to sink.
+      if (right_[irhs] == kUnused || TryAugment(right_[irhs], seen)) {
+        // Add flow from left_[ilhs] to right_[irhs].
+        left_[ilhs] = irhs;
+        right_[irhs] = ilhs;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  const MatchMatrix* graph_; // not owned
+  // Each element of the left_ vector represents a left hand side node
+  // (i.e. an element) and each element of right_ is a right hand side
+  // node (i.e. a matcher). The values in the left_ vector indicate
+  // outflow from that node to a node on the right_ side. The values
+  // in the right_ indicate inflow, and specify which left_ node is
+  // feeding that right_ node, if any. For example, left_[3] == 1 means
+  // there's a flow from element #3 to matcher #1. Such a flow would also
+  // be redundantly represented in the right_ vector as right_[1] == 3.
+  // Elements of left_ and right_ are either kUnused or mutually
+  // referent. Mutually referent means that left_[right_[i]] = i and
+  // right_[left_[i]] = i.
+  ::std::vector<size_t> left_;
+  ::std::vector<size_t> right_;
+
+  GTEST_DISALLOW_ASSIGN_(MaxBipartiteMatchState);
+};
+
+const size_t MaxBipartiteMatchState::kUnused;
+
+GTEST_API_ ElementMatcherPairs
+FindMaxBipartiteMatching(const MatchMatrix& g) {
+  return MaxBipartiteMatchState(g).Compute();
+}
+
+static void LogElementMatcherPairVec(const ElementMatcherPairs& pairs,
+                                     ::std::ostream* stream) {
+  typedef ElementMatcherPairs::const_iterator Iter;
+  ::std::ostream& os = *stream;
+  os << "{";
+  const char *sep = "";
+  for (Iter it = pairs.begin(); it != pairs.end(); ++it) {
+    os << sep << "\n  ("
+       << "element #" << it->first << ", "
+       << "matcher #" << it->second << ")";
+    sep = ",";
+  }
+  os << "\n}";
+}
+
+// Tries to find a pairing, and explains the result.
+GTEST_API_ bool FindPairing(const MatchMatrix& matrix,
+                            MatchResultListener* listener) {
+  ElementMatcherPairs matches = FindMaxBipartiteMatching(matrix);
+
+  size_t max_flow = matches.size();
+  bool result = (max_flow == matrix.RhsSize());
+
+  if (!result) {
+    if (listener->IsInterested()) {
+      *listener << "where no permutation of the elements can "
+                   "satisfy all matchers, and the closest match is "
+                << max_flow << " of " << matrix.RhsSize()
+                << " matchers with the pairings:\n";
+      LogElementMatcherPairVec(matches, listener->stream());
+    }
+    return false;
+  }
+
+  if (matches.size() > 1) {
+    if (listener->IsInterested()) {
+      const char *sep = "where:\n";
+      for (size_t mi = 0; mi < matches.size(); ++mi) {
+        *listener << sep << " - element #" << matches[mi].first
+                  << " is matched by matcher #" << matches[mi].second;
+        sep = ",\n";
+      }
+    }
+  }
+  return true;
+}
+
+bool MatchMatrix::NextGraph() {
+  for (size_t ilhs = 0; ilhs < LhsSize(); ++ilhs) {
+    for (size_t irhs = 0; irhs < RhsSize(); ++irhs) {
+      char& b = matched_[SpaceIndex(ilhs, irhs)];
+      if (!b) {
+        b = 1;
+        return true;
+      }
+      b = 0;
+    }
+  }
+  return false;
+}
+
+void MatchMatrix::Randomize() {
+  for (size_t ilhs = 0; ilhs < LhsSize(); ++ilhs) {
+    for (size_t irhs = 0; irhs < RhsSize(); ++irhs) {
+      char& b = matched_[SpaceIndex(ilhs, irhs)];
+      b = static_cast<char>(rand() & 1); // NOLINT
+    }
+  }
+}
+
+string MatchMatrix::DebugString() const {
+  ::std::stringstream ss;
+  const char *sep = "";
+  for (size_t i = 0; i < LhsSize(); ++i) {
+    ss << sep;
+    for (size_t j = 0; j < RhsSize(); ++j) {
+      ss << HasEdge(i, j);
+    }
+    sep = ";";
+  }
+  return ss.str();
+}
+
+void UnorderedElementsAreMatcherImplBase::DescribeToImpl(
+    ::std::ostream* os) const {
+  if (matcher_describers_.empty()) {
+    *os << "is empty";
+    return;
+  }
+  if (matcher_describers_.size() == 1) {
+    *os << "has " << Elements(1) << " and that element ";
+    matcher_describers_[0]->DescribeTo(os);
+    return;
+  }
+  *os << "has " << Elements(matcher_describers_.size())
+      << " and there exists some permutation of elements such that:\n";
+  const char* sep = "";
+  for (size_t i = 0; i != matcher_describers_.size(); ++i) {
+    *os << sep << " - element #" << i << " ";
+
matcher_describers_[i]->DescribeTo(os); + sep = ", and\n"; + } +} + +void UnorderedElementsAreMatcherImplBase::DescribeNegationToImpl( + ::std::ostream* os) const { + if (matcher_describers_.empty()) { + *os << "isn't empty"; + return; + } + if (matcher_describers_.size() == 1) { + *os << "doesn't have " << Elements(1) + << ", or has " << Elements(1) << " that "; + matcher_describers_[0]->DescribeNegationTo(os); + return; + } + *os << "doesn't have " << Elements(matcher_describers_.size()) + << ", or there exists no permutation of elements such that:\n"; + const char* sep = ""; + for (size_t i = 0; i != matcher_describers_.size(); ++i) { + *os << sep << " - element #" << i << " "; + matcher_describers_[i]->DescribeTo(os); + sep = ", and\n"; + } +} + +// Checks that all matchers match at least one element, and that all +// elements match at least one matcher. This enables faster matching +// and better error reporting. +// Returns false, writing an explanation to 'listener', if and only +// if the success criteria are not met. +bool UnorderedElementsAreMatcherImplBase:: +VerifyAllElementsAndMatchersAreMatched( + const ::std::vector<string>& element_printouts, + const MatchMatrix& matrix, + MatchResultListener* listener) const { + bool result = true; + ::std::vector<char> element_matched(matrix.LhsSize(), 0); + ::std::vector<char> matcher_matched(matrix.RhsSize(), 0); + + for (size_t ilhs = 0; ilhs < matrix.LhsSize(); ilhs++) { + for (size_t irhs = 0; irhs < matrix.RhsSize(); irhs++) { + char matched = matrix.HasEdge(ilhs, irhs); + element_matched[ilhs] |= matched; + matcher_matched[irhs] |= matched; + } + } + + { + const char* sep = + "where the following matchers don't match any elements:\n"; + for (size_t mi = 0; mi < matcher_matched.size(); ++mi) { + if (matcher_matched[mi]) + continue; + result = false; + if (listener->IsInterested()) { + *listener << sep << "matcher #" << mi << ": "; + matcher_describers_[mi]->DescribeTo(listener->stream()); + sep = ",\n"; + } + } + } + + { + const char* sep = + "where the following elements don't match any matchers:\n"; + const char* outer_sep = ""; + if (!result) { + outer_sep = "\nand "; + } + for (size_t ei = 0; ei < element_matched.size(); ++ei) { + if (element_matched[ei]) + continue; + result = false; + if (listener->IsInterested()) { + *listener << outer_sep << sep << "element #" << ei << ": " + << element_printouts[ei]; + sep = ",\n"; + outer_sep = ""; + } + } + } + return result; +} + +} // namespace internal +} // namespace testing diff --git a/utils/unittest/googlemock/src/gmock-spec-builders.cc b/utils/unittest/googlemock/src/gmock-spec-builders.cc new file mode 100644 index 000000000000..95513420707f --- /dev/null +++ b/utils/unittest/googlemock/src/gmock-spec-builders.cc @@ -0,0 +1,823 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Mock - a framework for writing C++ mock classes. +// +// This file implements the spec builder syntax (ON_CALL and +// EXPECT_CALL). + +#include "gmock/gmock-spec-builders.h" + +#include <stdlib.h> +#include <iostream> // NOLINT +#include <map> +#include <set> +#include <string> +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#if GTEST_OS_CYGWIN || GTEST_OS_LINUX || GTEST_OS_MAC +# include <unistd.h> // NOLINT +#endif + +namespace testing { +namespace internal { + +// Protects the mock object registry (in class Mock), all function +// mockers, and all expectations. +GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_gmock_mutex); + +// Logs a message including file and line number information. +GTEST_API_ void LogWithLocation(testing::internal::LogSeverity severity, + const char* file, int line, + const string& message) { + ::std::ostringstream s; + s << file << ":" << line << ": " << message << ::std::endl; + Log(severity, s.str(), 0); +} + +// Constructs an ExpectationBase object. +ExpectationBase::ExpectationBase(const char* a_file, + int a_line, + const string& a_source_text) + : file_(a_file), + line_(a_line), + source_text_(a_source_text), + cardinality_specified_(false), + cardinality_(Exactly(1)), + call_count_(0), + retired_(false), + extra_matcher_specified_(false), + repeated_action_specified_(false), + retires_on_saturation_(false), + last_clause_(kNone), + action_count_checked_(false) {} + +// Destructs an ExpectationBase object. +ExpectationBase::~ExpectationBase() {} + +// Explicitly specifies the cardinality of this expectation. Used by +// the subclasses to implement the .Times() clause. +void ExpectationBase::SpecifyCardinality(const Cardinality& a_cardinality) { + cardinality_specified_ = true; + cardinality_ = a_cardinality; +} + +// Retires all pre-requisites of this expectation. +void ExpectationBase::RetireAllPreRequisites() + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + if (is_retired()) { + // We can take this short-cut as we never retire an expectation + // until we have retired all its pre-requisites. + return; + } + + for (ExpectationSet::const_iterator it = immediate_prerequisites_.begin(); + it != immediate_prerequisites_.end(); ++it) { + ExpectationBase* const prerequisite = it->expectation_base().get(); + if (!prerequisite->is_retired()) { + prerequisite->RetireAllPreRequisites(); + prerequisite->Retire(); + } + } +} + +// Returns true iff all pre-requisites of this expectation have been +// satisfied. 
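
RetireAllPreRequisites() above recurses through the prerequisite DAG and retires the deepest expectations first; its early return is sound because, as the comment notes, an expectation is never retired before all of its prerequisites are. A stripped-down sketch of that walk, with Node standing in for ExpectationBase and prereqs for immediate_prerequisites_:

    #include <cstddef>
    #include <vector>

    struct Node {
      bool retired;
      std::vector<Node*> prereqs;
      Node() : retired(false) {}
    };

    // Retires every prerequisite of n (but not n itself), deepest first,
    // mirroring ExpectationBase::RetireAllPreRequisites() above.
    void RetireAllPrereqs(Node* n) {
      if (n->retired) return;  // its prerequisites are already retired too
      for (std::size_t i = 0; i < n->prereqs.size(); ++i) {
        Node* p = n->prereqs[i];
        if (!p->retired) {
          RetireAllPrereqs(p);  // retire the prerequisite's own prerequisites
          p->retired = true;
        }
      }
    }
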
+bool ExpectationBase::AllPrerequisitesAreSatisfied() const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + for (ExpectationSet::const_iterator it = immediate_prerequisites_.begin(); + it != immediate_prerequisites_.end(); ++it) { + if (!(it->expectation_base()->IsSatisfied()) || + !(it->expectation_base()->AllPrerequisitesAreSatisfied())) + return false; + } + return true; +} + +// Adds unsatisfied pre-requisites of this expectation to 'result'. +void ExpectationBase::FindUnsatisfiedPrerequisites(ExpectationSet* result) const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + for (ExpectationSet::const_iterator it = immediate_prerequisites_.begin(); + it != immediate_prerequisites_.end(); ++it) { + if (it->expectation_base()->IsSatisfied()) { + // If *it is satisfied and has a call count of 0, some of its + // pre-requisites may not be satisfied yet. + if (it->expectation_base()->call_count_ == 0) { + it->expectation_base()->FindUnsatisfiedPrerequisites(result); + } + } else { + // Now that we know *it is unsatisfied, we are not so interested + // in whether its pre-requisites are satisfied. Therefore we + // don't recursively call FindUnsatisfiedPrerequisites() here. + *result += *it; + } + } +} + +// Describes how many times a function call matching this +// expectation has occurred. +void ExpectationBase::DescribeCallCountTo(::std::ostream* os) const + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + + // Describes how many times the function is expected to be called. + *os << " Expected: to be "; + cardinality().DescribeTo(os); + *os << "\n Actual: "; + Cardinality::DescribeActualCallCountTo(call_count(), os); + + // Describes the state of the expectation (e.g. is it satisfied? + // is it active?). + *os << " - " << (IsOverSaturated() ? "over-saturated" : + IsSaturated() ? "saturated" : + IsSatisfied() ? "satisfied" : "unsatisfied") + << " and " + << (is_retired() ? "retired" : "active"); +} + +// Checks the action count (i.e. the number of WillOnce() and +// WillRepeatedly() clauses) against the cardinality if this hasn't +// been done before. Prints a warning if there are too many or too +// few actions. +void ExpectationBase::CheckActionCountIfNotDone() const + GTEST_LOCK_EXCLUDED_(mutex_) { + bool should_check = false; + { + MutexLock l(&mutex_); + if (!action_count_checked_) { + action_count_checked_ = true; + should_check = true; + } + } + + if (should_check) { + if (!cardinality_specified_) { + // The cardinality was inferred - no need to check the action + // count against it. + return; + } + + // The cardinality was explicitly specified. + const int action_count = static_cast<int>(untyped_actions_.size()); + const int upper_bound = cardinality().ConservativeUpperBound(); + const int lower_bound = cardinality().ConservativeLowerBound(); + bool too_many; // True if there are too many actions, or false + // if there are too few. + if (action_count > upper_bound || + (action_count == upper_bound && repeated_action_specified_)) { + too_many = true; + } else if (0 < action_count && action_count < lower_bound && + !repeated_action_specified_) { + too_many = false; + } else { + return; + } + + ::std::stringstream ss; + DescribeLocationTo(&ss); + ss << "Too " << (too_many ? "many" : "few") + << " actions specified in " << source_text() << "...\n" + << "Expected to be "; + cardinality().DescribeTo(&ss); + ss << ", but has " << (too_many ? 
"" : "only ") + << action_count << " WillOnce()" + << (action_count == 1 ? "" : "s"); + if (repeated_action_specified_) { + ss << " and a WillRepeatedly()"; + } + ss << "."; + Log(kWarning, ss.str(), -1); // -1 means "don't print stack trace". + } +} + +// Implements the .Times() clause. +void ExpectationBase::UntypedTimes(const Cardinality& a_cardinality) { + if (last_clause_ == kTimes) { + ExpectSpecProperty(false, + ".Times() cannot appear " + "more than once in an EXPECT_CALL()."); + } else { + ExpectSpecProperty(last_clause_ < kTimes, + ".Times() cannot appear after " + ".InSequence(), .WillOnce(), .WillRepeatedly(), " + "or .RetiresOnSaturation()."); + } + last_clause_ = kTimes; + + SpecifyCardinality(a_cardinality); +} + +// Points to the implicit sequence introduced by a living InSequence +// object (if any) in the current thread or NULL. +GTEST_API_ ThreadLocal<Sequence*> g_gmock_implicit_sequence; + +// Reports an uninteresting call (whose description is in msg) in the +// manner specified by 'reaction'. +void ReportUninterestingCall(CallReaction reaction, const string& msg) { + // Include a stack trace only if --gmock_verbose=info is specified. + const int stack_frames_to_skip = + GMOCK_FLAG(verbose) == kInfoVerbosity ? 3 : -1; + switch (reaction) { + case kAllow: + Log(kInfo, msg, stack_frames_to_skip); + break; + case kWarn: + Log(kWarning, + msg + + "\nNOTE: You can safely ignore the above warning unless this " + "call should not happen. Do not suppress it by blindly adding " + "an EXPECT_CALL() if you don't mean to enforce the call. " + "See https://github.com/google/googletest/blob/master/googlemock/docs/CookBook.md#" + "knowing-when-to-expect for details.\n", + stack_frames_to_skip); + break; + default: // FAIL + Expect(false, NULL, -1, msg); + } +} + +UntypedFunctionMockerBase::UntypedFunctionMockerBase() + : mock_obj_(NULL), name_("") {} + +UntypedFunctionMockerBase::~UntypedFunctionMockerBase() {} + +// Sets the mock object this mock method belongs to, and registers +// this information in the global mock registry. Will be called +// whenever an EXPECT_CALL() or ON_CALL() is executed on this mock +// method. +void UntypedFunctionMockerBase::RegisterOwner(const void* mock_obj) + GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { + { + MutexLock l(&g_gmock_mutex); + mock_obj_ = mock_obj; + } + Mock::Register(mock_obj, this); +} + +// Sets the mock object this mock method belongs to, and sets the name +// of the mock function. Will be called upon each invocation of this +// mock function. +void UntypedFunctionMockerBase::SetOwnerAndName(const void* mock_obj, + const char* name) + GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { + // We protect name_ under g_gmock_mutex in case this mock function + // is called from two threads concurrently. + MutexLock l(&g_gmock_mutex); + mock_obj_ = mock_obj; + name_ = name; +} + +// Returns the name of the function being mocked. Must be called +// after RegisterOwner() or SetOwnerAndName() has been called. +const void* UntypedFunctionMockerBase::MockObject() const + GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { + const void* mock_obj; + { + // We protect mock_obj_ under g_gmock_mutex in case this mock + // function is called from two threads concurrently. + MutexLock l(&g_gmock_mutex); + Assert(mock_obj_ != NULL, __FILE__, __LINE__, + "MockObject() must not be called before RegisterOwner() or " + "SetOwnerAndName() has been called."); + mock_obj = mock_obj_; + } + return mock_obj; +} + +// Returns the name of this mock method. 
Must be called after +// SetOwnerAndName() has been called. +const char* UntypedFunctionMockerBase::Name() const + GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { + const char* name; + { + // We protect name_ under g_gmock_mutex in case this mock + // function is called from two threads concurrently. + MutexLock l(&g_gmock_mutex); + Assert(name_ != NULL, __FILE__, __LINE__, + "Name() must not be called before SetOwnerAndName() has " + "been called."); + name = name_; + } + return name; +} + +// Calculates the result of invoking this mock function with the given +// arguments, prints it, and returns it. The caller is responsible +// for deleting the result. +UntypedActionResultHolderBase* +UntypedFunctionMockerBase::UntypedInvokeWith(const void* const untyped_args) + GTEST_LOCK_EXCLUDED_(g_gmock_mutex) { + if (untyped_expectations_.size() == 0) { + // No expectation is set on this mock method - we have an + // uninteresting call. + + // We must get Google Mock's reaction on uninteresting calls + // made on this mock object BEFORE performing the action, + // because the action may DELETE the mock object and make the + // following expression meaningless. + const CallReaction reaction = + Mock::GetReactionOnUninterestingCalls(MockObject()); + + // True iff we need to print this call's arguments and return + // value. This definition must be kept in sync with + // the behavior of ReportUninterestingCall(). + const bool need_to_report_uninteresting_call = + // If the user allows this uninteresting call, we print it + // only when he wants informational messages. + reaction == kAllow ? LogIsVisible(kInfo) : + // If the user wants this to be a warning, we print it only + // when he wants to see warnings. + reaction == kWarn ? LogIsVisible(kWarning) : + // Otherwise, the user wants this to be an error, and we + // should always print detailed information in the error. + true; + + if (!need_to_report_uninteresting_call) { + // Perform the action without printing the call information. + return this->UntypedPerformDefaultAction(untyped_args, ""); + } + + // Warns about the uninteresting call. + ::std::stringstream ss; + this->UntypedDescribeUninterestingCall(untyped_args, &ss); + + // Calculates the function result. + UntypedActionResultHolderBase* const result = + this->UntypedPerformDefaultAction(untyped_args, ss.str()); + + // Prints the function result. + if (result != NULL) + result->PrintAsActionResult(&ss); + + ReportUninterestingCall(reaction, ss.str()); + return result; + } + + bool is_excessive = false; + ::std::stringstream ss; + ::std::stringstream why; + ::std::stringstream loc; + const void* untyped_action = NULL; + + // The UntypedFindMatchingExpectation() function acquires and + // releases g_gmock_mutex. + const ExpectationBase* const untyped_expectation = + this->UntypedFindMatchingExpectation( + untyped_args, &untyped_action, &is_excessive, + &ss, &why); + const bool found = untyped_expectation != NULL; + + // True iff we need to print the call's arguments and return value. + // This definition must be kept in sync with the uses of Expect() + // and Log() in this function. + const bool need_to_report_call = + !found || is_excessive || LogIsVisible(kInfo); + if (!need_to_report_call) { + // Perform the action without printing the call information. + return + untyped_action == NULL ? 
+ this->UntypedPerformDefaultAction(untyped_args, "") : + this->UntypedPerformAction(untyped_action, untyped_args); + } + + ss << " Function call: " << Name(); + this->UntypedPrintArgs(untyped_args, &ss); + + // In case the action deletes a piece of the expectation, we + // generate the message beforehand. + if (found && !is_excessive) { + untyped_expectation->DescribeLocationTo(&loc); + } + + UntypedActionResultHolderBase* const result = + untyped_action == NULL ? + this->UntypedPerformDefaultAction(untyped_args, ss.str()) : + this->UntypedPerformAction(untyped_action, untyped_args); + if (result != NULL) + result->PrintAsActionResult(&ss); + ss << "\n" << why.str(); + + if (!found) { + // No expectation matches this call - reports a failure. + Expect(false, NULL, -1, ss.str()); + } else if (is_excessive) { + // We had an upper-bound violation and the failure message is in ss. + Expect(false, untyped_expectation->file(), + untyped_expectation->line(), ss.str()); + } else { + // We had an expected call and the matching expectation is + // described in ss. + Log(kInfo, loc.str() + ss.str(), 2); + } + + return result; +} + +// Returns an Expectation object that references and co-owns exp, +// which must be an expectation on this mock function. +Expectation UntypedFunctionMockerBase::GetHandleOf(ExpectationBase* exp) { + for (UntypedExpectations::const_iterator it = + untyped_expectations_.begin(); + it != untyped_expectations_.end(); ++it) { + if (it->get() == exp) { + return Expectation(*it); + } + } + + Assert(false, __FILE__, __LINE__, "Cannot find expectation."); + return Expectation(); + // The above statement is just to make the code compile, and will + // never be executed. +} + +// Verifies that all expectations on this mock function have been +// satisfied. Reports one or more Google Test non-fatal failures +// and returns false if not. +bool UntypedFunctionMockerBase::VerifyAndClearExpectationsLocked() + GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) { + g_gmock_mutex.AssertHeld(); + bool expectations_met = true; + for (UntypedExpectations::const_iterator it = + untyped_expectations_.begin(); + it != untyped_expectations_.end(); ++it) { + ExpectationBase* const untyped_expectation = it->get(); + if (untyped_expectation->IsOverSaturated()) { + // There was an upper-bound violation. Since the error was + // already reported when it occurred, there is no need to do + // anything here. + expectations_met = false; + } else if (!untyped_expectation->IsSatisfied()) { + expectations_met = false; + ::std::stringstream ss; + ss << "Actual function call count doesn't match " + << untyped_expectation->source_text() << "...\n"; + // No need to show the source file location of the expectation + // in the description, as the Expect() call that follows already + // takes care of it. + untyped_expectation->MaybeDescribeExtraMatcherTo(&ss); + untyped_expectation->DescribeCallCountTo(&ss); + Expect(false, untyped_expectation->file(), + untyped_expectation->line(), ss.str()); + } + } + + // Deleting our expectations may trigger other mock objects to be deleted, for + // example if an action contains a reference counted smart pointer to that + // mock object, and that is the last reference. So if we delete our + // expectations within the context of the global mutex we may deadlock when + // this method is called again. Instead, make a copy of the set of + // expectations to delete, clear our set within the mutex, and then clear the + // copied set outside of it. 
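
The comment above ends with a classic deadlock-avoidance idiom: empty the container while the lock is held, then destroy the elements with it released, so that element destructors which re-enter the same mutex cannot deadlock. A generic sketch of the idiom, with std::mutex standing in for gmock's internal g_gmock_mutex and int* elements standing in for expectations:

    #include <cstddef>
    #include <mutex>
    #include <vector>

    std::mutex registry_mutex;
    std::vector<int*> registry;  // pretend the pointees' destructors re-take
                                 // registry_mutex, as gmock expectations can

    void ClearRegistry() {
      std::vector<int*> doomed;
      {
        std::lock_guard<std::mutex> lock(registry_mutex);
        doomed.swap(registry);  // registry is emptied under the lock
      }                         // lock released before any destruction
      for (std::size_t i = 0; i < doomed.size(); ++i)
        delete doomed[i];       // runs without the lock held
    }

gmock cannot use a scoped release here because VerifyAndClearExpectationsLocked() must hold g_gmock_mutex on entry and exit, hence the explicit Unlock()/Lock() pair around the clear() just below.
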
+ UntypedExpectations expectations_to_delete; + untyped_expectations_.swap(expectations_to_delete); + + g_gmock_mutex.Unlock(); + expectations_to_delete.clear(); + g_gmock_mutex.Lock(); + + return expectations_met; +} + +} // namespace internal + +// Class Mock. + +namespace { + +typedef std::set<internal::UntypedFunctionMockerBase*> FunctionMockers; + +// The current state of a mock object. Such information is needed for +// detecting leaked mock objects and explicitly verifying a mock's +// expectations. +struct MockObjectState { + MockObjectState() + : first_used_file(NULL), first_used_line(-1), leakable(false) {} + + // Where in the source file an ON_CALL or EXPECT_CALL is first + // invoked on this mock object. + const char* first_used_file; + int first_used_line; + ::std::string first_used_test_case; + ::std::string first_used_test; + bool leakable; // true iff it's OK to leak the object. + FunctionMockers function_mockers; // All registered methods of the object. +}; + +// A global registry holding the state of all mock objects that are +// alive. A mock object is added to this registry the first time +// Mock::AllowLeak(), ON_CALL(), or EXPECT_CALL() is called on it. It +// is removed from the registry in the mock object's destructor. +class MockObjectRegistry { + public: + // Maps a mock object (identified by its address) to its state. + typedef std::map<const void*, MockObjectState> StateMap; + + // This destructor will be called when a program exits, after all + // tests in it have been run. By then, there should be no mock + // object alive. Therefore we report any living object as test + // failure, unless the user explicitly asked us to ignore it. + ~MockObjectRegistry() { + // "using ::std::cout;" doesn't work with Symbian's STLport, where cout is + // a macro. + + if (!GMOCK_FLAG(catch_leaked_mocks)) + return; + + int leaked_count = 0; + for (StateMap::const_iterator it = states_.begin(); it != states_.end(); + ++it) { + if (it->second.leakable) // The user said it's fine to leak this object. + continue; + + // TODO(wan@google.com): Print the type of the leaked object. + // This can help the user identify the leaked object. + std::cout << "\n"; + const MockObjectState& state = it->second; + std::cout << internal::FormatFileLocation(state.first_used_file, + state.first_used_line); + std::cout << " ERROR: this mock object"; + if (state.first_used_test != "") { + std::cout << " (used in test " << state.first_used_test_case << "." + << state.first_used_test << ")"; + } + std::cout << " should be deleted but never is. Its address is @" + << it->first << "."; + leaked_count++; + } + if (leaked_count > 0) { + std::cout << "\nERROR: " << leaked_count + << " leaked mock " << (leaked_count == 1 ? "object" : "objects") + << " found at program exit.\n"; + std::cout.flush(); + ::std::cerr.flush(); + // RUN_ALL_TESTS() has already returned when this destructor is + // called. Therefore we cannot use the normal Google Test + // failure reporting mechanism. + _exit(1); // We cannot call exit() as it is not reentrant and + // may already have been called. + } + } + + StateMap& states() { return states_; } + + private: + StateMap states_; +}; + +// Protected by g_gmock_mutex. +MockObjectRegistry g_mock_object_registry; + +// Maps a mock object to the reaction Google Mock should have when an +// uninteresting method is called. Protected by g_gmock_mutex. 
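
MockObjectRegistry's destructor above runs after RUN_ALL_TESTS() has already returned, so it cannot raise a normal test failure; it prints directly and terminates with _exit(1). Reduced to its skeleton, the exit-time check is just a static object with a reporting destructor. A sketch, with our own names and std::_Exit as a portable stand-in for the POSIX _exit the real code uses:

    #include <cstdio>
    #include <cstdlib>
    #include <map>
    #include <string>

    // Static object whose destructor runs at program exit, mirroring
    // MockObjectRegistry above.
    struct LeakRegistry {
      std::map<const void*, std::string> live;  // address -> first-use location
      ~LeakRegistry() {
        if (live.empty()) return;
        for (std::map<const void*, std::string>::const_iterator it = live.begin();
             it != live.end(); ++it) {
          std::printf("%s: ERROR: mock object @%p should be deleted but "
                      "never is.\n", it->second.c_str(), it->first);
        }
        std::fflush(stdout);
        std::_Exit(1);  // exit() is not safely reentrant this late in shutdown
      }
    };

    LeakRegistry g_leak_registry;  // destroyed after main() returns
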
+std::map<const void*, internal::CallReaction> g_uninteresting_call_reaction; + +// Sets the reaction Google Mock should have when an uninteresting +// method of the given mock object is called. +void SetReactionOnUninterestingCalls(const void* mock_obj, + internal::CallReaction reaction) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { + internal::MutexLock l(&internal::g_gmock_mutex); + g_uninteresting_call_reaction[mock_obj] = reaction; +} + +} // namespace + +// Tells Google Mock to allow uninteresting calls on the given mock +// object. +void Mock::AllowUninterestingCalls(const void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { + SetReactionOnUninterestingCalls(mock_obj, internal::kAllow); +} + +// Tells Google Mock to warn the user about uninteresting calls on the +// given mock object. +void Mock::WarnUninterestingCalls(const void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { + SetReactionOnUninterestingCalls(mock_obj, internal::kWarn); +} + +// Tells Google Mock to fail uninteresting calls on the given mock +// object. +void Mock::FailUninterestingCalls(const void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { + SetReactionOnUninterestingCalls(mock_obj, internal::kFail); +} + +// Tells Google Mock the given mock object is being destroyed and its +// entry in the call-reaction table should be removed. +void Mock::UnregisterCallReaction(const void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { + internal::MutexLock l(&internal::g_gmock_mutex); + g_uninteresting_call_reaction.erase(mock_obj); +} + +// Returns the reaction Google Mock will have on uninteresting calls +// made on the given mock object. +internal::CallReaction Mock::GetReactionOnUninterestingCalls( + const void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { + internal::MutexLock l(&internal::g_gmock_mutex); + return (g_uninteresting_call_reaction.count(mock_obj) == 0) ? + internal::kDefault : g_uninteresting_call_reaction[mock_obj]; +} + +// Tells Google Mock to ignore mock_obj when checking for leaked mock +// objects. +void Mock::AllowLeak(const void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { + internal::MutexLock l(&internal::g_gmock_mutex); + g_mock_object_registry.states()[mock_obj].leakable = true; +} + +// Verifies and clears all expectations on the given mock object. If +// the expectations aren't satisfied, generates one or more Google +// Test non-fatal failures and returns false. +bool Mock::VerifyAndClearExpectations(void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { + internal::MutexLock l(&internal::g_gmock_mutex); + return VerifyAndClearExpectationsLocked(mock_obj); +} + +// Verifies all expectations on the given mock object and clears its +// default actions and expectations. Returns true iff the +// verification was successful. +bool Mock::VerifyAndClear(void* mock_obj) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { + internal::MutexLock l(&internal::g_gmock_mutex); + ClearDefaultActionsLocked(mock_obj); + return VerifyAndClearExpectationsLocked(mock_obj); +} + +// Verifies and clears all expectations on the given mock object. If +// the expectations aren't satisfied, generates one or more Google +// Test non-fatal failures and returns false. 
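
These Mock:: entry points are the public face of the reaction table and object registry. A sketch of how they surface in a test, assuming a hypothetical MockTurtle mock class; the standard NiceMock/StrictMock wrappers select the kAllow/kFail reactions through the registrars above, and GetReactionOnUninterestingCalls() falling back to kDefault is what makes the warning behavior the out-of-the-box default:

    #include "gmock/gmock.h"

    class Turtle {
     public:
      virtual ~Turtle() {}
      virtual void PenUp() = 0;
    };

    class MockTurtle : public Turtle {
     public:
      MOCK_METHOD0(PenUp, void());  // mock macro of this googlemock vintage
    };

    void ReactionAndVerifyDemo() {
      ::testing::NiceMock<MockTurtle> nice;      // uninteresting calls: kAllow
      ::testing::StrictMock<MockTurtle> strict;  // uninteresting calls: kFail
      nice.PenUp();  // silently allowed; a plain MockTurtle would warn (kWarn)

      MockTurtle turtle;
      EXPECT_CALL(turtle, PenUp());
      turtle.PenUp();
      // Mid-test checkpoint: fails here if unmet, then resets expectations.
      ::testing::Mock::VerifyAndClearExpectations(&turtle);

      // A mock that is deliberately never deleted can opt out of the
      // exit-time leak check.
      ::testing::Mock::AllowLeak(new ::testing::NiceMock<MockTurtle>);
    }
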
+bool Mock::VerifyAndClearExpectationsLocked(void* mock_obj) + GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex) { + internal::g_gmock_mutex.AssertHeld(); + if (g_mock_object_registry.states().count(mock_obj) == 0) { + // No EXPECT_CALL() was set on the given mock object. + return true; + } + + // Verifies and clears the expectations on each mock method in the + // given mock object. + bool expectations_met = true; + FunctionMockers& mockers = + g_mock_object_registry.states()[mock_obj].function_mockers; + for (FunctionMockers::const_iterator it = mockers.begin(); + it != mockers.end(); ++it) { + if (!(*it)->VerifyAndClearExpectationsLocked()) { + expectations_met = false; + } + } + + // We don't clear the content of mockers, as they may still be + // needed by ClearDefaultActionsLocked(). + return expectations_met; +} + +// Registers a mock object and a mock method it owns. +void Mock::Register(const void* mock_obj, + internal::UntypedFunctionMockerBase* mocker) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { + internal::MutexLock l(&internal::g_gmock_mutex); + g_mock_object_registry.states()[mock_obj].function_mockers.insert(mocker); +} + +// Tells Google Mock where in the source code mock_obj is used in an +// ON_CALL or EXPECT_CALL. In case mock_obj is leaked, this +// information helps the user identify which object it is. +void Mock::RegisterUseByOnCallOrExpectCall(const void* mock_obj, + const char* file, int line) + GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) { + internal::MutexLock l(&internal::g_gmock_mutex); + MockObjectState& state = g_mock_object_registry.states()[mock_obj]; + if (state.first_used_file == NULL) { + state.first_used_file = file; + state.first_used_line = line; + const TestInfo* const test_info = + UnitTest::GetInstance()->current_test_info(); + if (test_info != NULL) { + // TODO(wan@google.com): record the test case name when the + // ON_CALL or EXPECT_CALL is invoked from SetUpTestCase() or + // TearDownTestCase(). + state.first_used_test_case = test_info->test_case_name(); + state.first_used_test = test_info->name(); + } + } +} + +// Unregisters a mock method; removes the owning mock object from the +// registry when the last mock method associated with it has been +// unregistered. This is called only in the destructor of +// FunctionMockerBase. +void Mock::UnregisterLocked(internal::UntypedFunctionMockerBase* mocker) + GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex) { + internal::g_gmock_mutex.AssertHeld(); + for (MockObjectRegistry::StateMap::iterator it = + g_mock_object_registry.states().begin(); + it != g_mock_object_registry.states().end(); ++it) { + FunctionMockers& mockers = it->second.function_mockers; + if (mockers.erase(mocker) > 0) { + // mocker was in mockers and has been just removed. + if (mockers.empty()) { + g_mock_object_registry.states().erase(it); + } + return; + } + } +} + +// Clears all ON_CALL()s set on the given mock object. +void Mock::ClearDefaultActionsLocked(void* mock_obj) + GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex) { + internal::g_gmock_mutex.AssertHeld(); + + if (g_mock_object_registry.states().count(mock_obj) == 0) { + // No ON_CALL() was set on the given mock object. + return; + } + + // Clears the default actions for each mock method in the given mock + // object. 
+ FunctionMockers& mockers = + g_mock_object_registry.states()[mock_obj].function_mockers; + for (FunctionMockers::const_iterator it = mockers.begin(); + it != mockers.end(); ++it) { + (*it)->ClearDefaultActionsLocked(); + } + + // We don't clear the content of mockers, as they may still be + // needed by VerifyAndClearExpectationsLocked(). +} + +Expectation::Expectation() {} + +Expectation::Expectation( + const internal::linked_ptr<internal::ExpectationBase>& an_expectation_base) + : expectation_base_(an_expectation_base) {} + +Expectation::~Expectation() {} + +// Adds an expectation to a sequence. +void Sequence::AddExpectation(const Expectation& expectation) const { + if (*last_expectation_ != expectation) { + if (last_expectation_->expectation_base() != NULL) { + expectation.expectation_base()->immediate_prerequisites_ + += *last_expectation_; + } + *last_expectation_ = expectation; + } +} + +// Creates the implicit sequence if there isn't one. +InSequence::InSequence() { + if (internal::g_gmock_implicit_sequence.get() == NULL) { + internal::g_gmock_implicit_sequence.set(new Sequence); + sequence_created_ = true; + } else { + sequence_created_ = false; + } +} + +// Deletes the implicit sequence if it was created by the constructor +// of this object. +InSequence::~InSequence() { + if (sequence_created_) { + delete internal::g_gmock_implicit_sequence.get(); + internal::g_gmock_implicit_sequence.set(NULL); + } +} + +} // namespace testing diff --git a/utils/unittest/googlemock/src/gmock.cc b/utils/unittest/googlemock/src/gmock.cc new file mode 100644 index 000000000000..eac3d842ba07 --- /dev/null +++ b/utils/unittest/googlemock/src/gmock.cc @@ -0,0 +1,183 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +#include "gmock/gmock.h" +#include "gmock/internal/gmock-port.h" + +namespace testing { + +// TODO(wan@google.com): support using environment variables to +// control the flag values, like what Google Test does. 
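
The Sequence/InSequence code above is what makes ordered expectations work: while an InSequence object is alive, every new expectation joins one implicit thread-local sequence, each taking the previous one as a prerequisite via Sequence::AddExpectation(). Usage sketch, reusing the hypothetical MockTurtle from the earlier sketch:

    #include "gmock/gmock.h"

    void ExpectPenUpThreeTimesInOrder(MockTurtle* turtle) {
      ::testing::InSequence seq;  // implicit sequence exists while seq lives
      EXPECT_CALL(*turtle, PenUp()).Times(1);  // must be satisfied first
      EXPECT_CALL(*turtle, PenUp()).Times(2);  // becomes active only after
    }  // seq destroyed; expectations set later are unordered again
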
+ +GMOCK_DEFINE_bool_(catch_leaked_mocks, true, + "true iff Google Mock should report leaked mock objects " + "as failures."); + +GMOCK_DEFINE_string_(verbose, internal::kWarningVerbosity, + "Controls how verbose Google Mock's output is." + " Valid values:\n" + " info - prints all messages.\n" + " warning - prints warnings and errors.\n" + " error - prints errors only."); + +namespace internal { + +// Parses a string as a command line flag. The string should have the +// format "--gmock_flag=value". When def_optional is true, the +// "=value" part can be omitted. +// +// Returns the value of the flag, or NULL if the parsing failed. +static const char* ParseGoogleMockFlagValue(const char* str, + const char* flag, + bool def_optional) { + // str and flag must not be NULL. + if (str == NULL || flag == NULL) return NULL; + + // The flag must start with "--gmock_". + const std::string flag_str = std::string("--gmock_") + flag; + const size_t flag_len = flag_str.length(); + if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL; + + // Skips the flag name. + const char* flag_end = str + flag_len; + + // When def_optional is true, it's OK to not have a "=value" part. + if (def_optional && (flag_end[0] == '\0')) { + return flag_end; + } + + // If def_optional is true and there are more characters after the + // flag name, or if def_optional is false, there must be a '=' after + // the flag name. + if (flag_end[0] != '=') return NULL; + + // Returns the string after "=". + return flag_end + 1; +} + +// Parses a string for a Google Mock bool flag, in the form of +// "--gmock_flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +static bool ParseGoogleMockBoolFlag(const char* str, const char* flag, + bool* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseGoogleMockFlagValue(str, flag, true); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Converts the string value to a bool. + *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); + return true; +} + +// Parses a string for a Google Mock string flag, in the form of +// "--gmock_flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +template <typename String> +static bool ParseGoogleMockStringFlag(const char* str, const char* flag, + String* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseGoogleMockFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + *value = value_str; + return true; +} + +// The internal implementation of InitGoogleMock(). +// +// The type parameter CharType can be instantiated to either char or +// wchar_t. +template <typename CharType> +void InitGoogleMockImpl(int* argc, CharType** argv) { + // Makes sure Google Test is initialized. InitGoogleTest() is + // idempotent, so it's fine if the user has already called it. + InitGoogleTest(argc, argv); + if (*argc <= 0) return; + + for (int i = 1; i != *argc; i++) { + const std::string arg_string = StreamableToString(argv[i]); + const char* const arg = arg_string.c_str(); + + // Do we see a Google Mock flag? 
+    if (ParseGoogleMockBoolFlag(arg, "catch_leaked_mocks",
+                                &GMOCK_FLAG(catch_leaked_mocks)) ||
+        ParseGoogleMockStringFlag(arg, "verbose", &GMOCK_FLAG(verbose))) {
+      // Yes. Shift the remainder of the argv list left by one. Note
+      // that argv has (*argc + 1) elements, the last one always being
+      // NULL. The following loop moves the trailing NULL element as
+      // well.
+      for (int j = i; j != *argc; j++) {
+        argv[j] = argv[j + 1];
+      }
+
+      // Decrements the argument count.
+      (*argc)--;
+
+      // We also need to decrement the iterator as we just removed
+      // an element.
+      i--;
+    }
+  }
+}
+
+} // namespace internal
+
+// Initializes Google Mock. This must be called before running the
+// tests. In particular, it parses a command line for the flags that
+// Google Mock recognizes. Whenever a Google Mock flag is seen, it is
+// removed from argv, and *argc is decremented.
+//
+// No value is returned. Instead, the Google Mock flag variables are
+// updated.
+//
+// Since Google Test is needed for Google Mock to work, this function
+// also initializes Google Test and parses its flags, if that hasn't
+// been done.
+GTEST_API_ void InitGoogleMock(int* argc, char** argv) {
+  internal::InitGoogleMockImpl(argc, argv);
+}
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleMock(int* argc, wchar_t** argv) {
+  internal::InitGoogleMockImpl(argc, argv);
+}
+
+} // namespace testing
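
End to end, the flag machinery above means a test binary's main() needs only the call below: recognized --gmock_* arguments are consumed (argv shifted left, *argc decremented), and everything else, including --gtest_* flags, is handled by the InitGoogleTest() call made internally.

    #include "gmock/gmock.h"
    #include "gtest/gtest.h"

    int main(int argc, char** argv) {
      // e.g. ./unit_test --gmock_verbose=info --gmock_catch_leaked_mocks=0
      ::testing::InitGoogleMock(&argc, argv);
      return RUN_ALL_TESTS();
    }
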